add paletted source support
For the moment, it looks like we only need an 8-bit palette with an 8888
payload.
Some little updates to bench.c to keep it sane with palettes.
Bug: skia:8473
Change-Id: I4efffea3273b5565954c0d20a666979f11ea685b
Reviewed-on: https://skia-review.googlesource.com/c/163170
Reviewed-by: Brian Osman <brianosman@google.com>
Reviewed-by: Leon Scroggins <scroggo@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/bench.c b/bench.c
index 64ce3c8..d67a916 100644
--- a/bench.c
+++ b/bench.c
@@ -12,6 +12,7 @@
#endif
#include "skcms.h"
+#include "skcms_internal.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
@@ -70,13 +71,22 @@
dst_fmt = skcms_PixelFormat_RGB_565;
const int wrap = skcms_PixelFormat_BGRA_ffff+1;
+ uint32_t palette[256];
+ for (int i = 0; i < 256; i++) {
+ palette[i] = (uint32_t)(255 - i%256) * 0x01010101;
+ }
+
clock_t start = clock();
+ bool all_ok = true;
for (int i = 0; i < n; i++) {
- (void)skcms_Transform(src_pixels, src_fmt, skcms_AlphaFormat_Unpremul, &src_profile,
- dst_pixels, dst_fmt, skcms_AlphaFormat_Unpremul, &dst_profile,
- NPIXELS);
+ const skcms_AlphaFormat upm = skcms_AlphaFormat_Unpremul;
+ all_ok &= skcms_TransformWithPalette(src_pixels, src_fmt, upm, &src_profile,
+ dst_pixels, dst_fmt, upm, &dst_profile,
+ NPIXELS, palette);
src_fmt = (src_fmt + 3) % wrap;
- dst_fmt = (dst_fmt + 7) % wrap;
+ do {
+ dst_fmt = (dst_fmt + 7) % wrap;
+ } while (needs_palette(dst_fmt));
}
clock_t ticks = clock() - start;
@@ -86,5 +96,5 @@
free(src_buf);
free(dst_buf);
- return 0;
+ return all_ok ? 0 : 1;
}
diff --git a/skcms.cc b/skcms.cc
index fb42ecb..2b1ca69 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -1752,6 +1752,7 @@
typedef enum {
Op_load_a8,
Op_load_g8,
+ Op_load_8888_palette8,
Op_load_4444,
Op_load_565,
Op_load_888,
@@ -1979,21 +1980,22 @@
static size_t bytes_per_pixel(skcms_PixelFormat fmt) {
switch (fmt >> 1) { // ignore rgb/bgr
- case skcms_PixelFormat_A_8 >> 1: return 1;
- case skcms_PixelFormat_G_8 >> 1: return 1;
- case skcms_PixelFormat_ABGR_4444 >> 1: return 2;
- case skcms_PixelFormat_RGB_565 >> 1: return 2;
- case skcms_PixelFormat_RGB_888 >> 1: return 3;
- case skcms_PixelFormat_RGBA_8888 >> 1: return 4;
- case skcms_PixelFormat_RGBA_1010102 >> 1: return 4;
- case skcms_PixelFormat_RGB_161616LE >> 1: return 6;
- case skcms_PixelFormat_RGBA_16161616LE >> 1: return 8;
- case skcms_PixelFormat_RGB_161616BE >> 1: return 6;
- case skcms_PixelFormat_RGBA_16161616BE >> 1: return 8;
- case skcms_PixelFormat_RGB_hhh >> 1: return 6;
- case skcms_PixelFormat_RGBA_hhhh >> 1: return 8;
- case skcms_PixelFormat_RGB_fff >> 1: return 12;
- case skcms_PixelFormat_RGBA_ffff >> 1: return 16;
+ case skcms_PixelFormat_A_8 >> 1: return 1;
+ case skcms_PixelFormat_G_8 >> 1: return 1;
+ case skcms_PixelFormat_RGBA_8888_Palette8 >> 1: return 1;
+ case skcms_PixelFormat_ABGR_4444 >> 1: return 2;
+ case skcms_PixelFormat_RGB_565 >> 1: return 2;
+ case skcms_PixelFormat_RGB_888 >> 1: return 3;
+ case skcms_PixelFormat_RGBA_8888 >> 1: return 4;
+ case skcms_PixelFormat_RGBA_1010102 >> 1: return 4;
+ case skcms_PixelFormat_RGB_161616LE >> 1: return 6;
+ case skcms_PixelFormat_RGBA_16161616LE >> 1: return 8;
+ case skcms_PixelFormat_RGB_161616BE >> 1: return 6;
+ case skcms_PixelFormat_RGBA_16161616BE >> 1: return 8;
+ case skcms_PixelFormat_RGB_hhh >> 1: return 6;
+ case skcms_PixelFormat_RGBA_hhhh >> 1: return 8;
+ case skcms_PixelFormat_RGB_fff >> 1: return 12;
+ case skcms_PixelFormat_RGBA_ffff >> 1: return 16;
}
assert(false);
return 0;
@@ -2025,7 +2027,22 @@
skcms_PixelFormat dstFmt,
skcms_AlphaFormat dstAlpha,
const skcms_ICCProfile* dstProfile,
- size_t nz) {
+ size_t npixels) {
+ return skcms_TransformWithPalette(src, srcFmt, srcAlpha, srcProfile,
+ dst, dstFmt, dstAlpha, dstProfile,
+ npixels, nullptr);
+}
+
+bool skcms_TransformWithPalette(const void* src,
+ skcms_PixelFormat srcFmt,
+ skcms_AlphaFormat srcAlpha,
+ const skcms_ICCProfile* srcProfile,
+ void* dst,
+ skcms_PixelFormat dstFmt,
+ skcms_AlphaFormat dstAlpha,
+ const skcms_ICCProfile* dstProfile,
+ size_t nz,
+ const void* palette) {
const size_t dst_bpp = bytes_per_pixel(dstFmt),
src_bpp = bytes_per_pixel(srcFmt);
// Let's just refuse if the request is absurdly big.
@@ -2048,6 +2065,10 @@
}
// TODO: more careful alias rejection (like, dst == src + 1)?
+ if (needs_palette(srcFmt) && !palette) {
+ return false;
+ }
+
Op program [32];
const void* arguments[32];
@@ -2074,6 +2095,10 @@
case skcms_PixelFormat_RGBA_hhhh >> 1: *ops++ = Op_load_hhhh; break;
case skcms_PixelFormat_RGB_fff >> 1: *ops++ = Op_load_fff; break;
case skcms_PixelFormat_RGBA_ffff >> 1: *ops++ = Op_load_ffff; break;
+
+ case skcms_PixelFormat_RGBA_8888_Palette8 >> 1: *ops++ = Op_load_8888_palette8;
+ *args++ = palette;
+ break;
}
if (srcFmt & 1) {
*ops++ = Op_swap_rb;
diff --git a/skcms.h b/skcms.h
index cbabfc8..d924f34 100644
--- a/skcms.h
+++ b/skcms.h
@@ -175,6 +175,8 @@
skcms_PixelFormat_A_8_,
skcms_PixelFormat_G_8,
skcms_PixelFormat_G_8_,
+ skcms_PixelFormat_RGBA_8888_Palette8,
+ skcms_PixelFormat_BGRA_8888_Palette8,
skcms_PixelFormat_RGB_565,
skcms_PixelFormat_BGR_565,
@@ -248,6 +250,18 @@
const skcms_ICCProfile* dstProfile,
size_t npixels);
+// As skcms_Transform(), supporting srcFmts with a palette.
+SKCMS_API bool skcms_TransformWithPalette(const void* src,
+ skcms_PixelFormat srcFmt,
+ skcms_AlphaFormat srcAlpha,
+ const skcms_ICCProfile* srcProfile,
+ void* dst,
+ skcms_PixelFormat dstFmt,
+ skcms_AlphaFormat dstAlpha,
+ const skcms_ICCProfile* dstProfile,
+ size_t npixels,
+ const void* palette);
+
// If profile can be used as a destination in skcms_Transform, return true. Otherwise, attempt to
// rewrite it with approximations where reasonable. If successful, return true. If no reasonable
// approximation exists, leave the profile unchanged and return false.
diff --git a/skcms_internal.h b/skcms_internal.h
index 4b0c395..551128a 100644
--- a/skcms_internal.h
+++ b/skcms_internal.h
@@ -37,6 +37,11 @@
static inline float fabsf_(float x) { return x < 0 ? -x : x; }
float powf_(float, float);
+// ~~~~ Does this pixel format need a palette pointer to be usable? ~~~~
+ static inline bool needs_palette(skcms_PixelFormat fmt) {
+ return (fmt >> 1) == (skcms_PixelFormat_RGBA_8888_Palette8 >> 1);
+ }
+
#ifdef __cplusplus
}
#endif
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index b3217c8..15262e2 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -348,52 +348,69 @@
return v;
}
-// Helper for gather_16(), loading the ix'th 16-bit value from p.
-SI uint16_t load_16(const uint8_t* p, int ix) {
- return load<uint16_t>(p + 2*ix);
-}
-
SI U16 gather_16(const uint8_t* p, I32 ix) {
+ // Load the i'th 16-bit value from p.
+ auto load_16 = [p](int i) {
+ return load<uint16_t>(p + 2*i);
+ };
#if N == 1
- U16 v = load_16(p,ix);
+ U16 v = load_16(ix);
#elif N == 4
- U16 v = { load_16(p,ix[0]), load_16(p,ix[1]), load_16(p,ix[2]), load_16(p,ix[3]) };
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
#elif N == 8
- U16 v = { load_16(p,ix[0]), load_16(p,ix[1]), load_16(p,ix[2]), load_16(p,ix[3]),
- load_16(p,ix[4]), load_16(p,ix[5]), load_16(p,ix[6]), load_16(p,ix[7]) };
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
+ load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
#elif N == 16
- U16 v = { load_16(p,ix[ 0]), load_16(p,ix[ 1]), load_16(p,ix[ 2]), load_16(p,ix[ 3]),
- load_16(p,ix[ 4]), load_16(p,ix[ 5]), load_16(p,ix[ 6]), load_16(p,ix[ 7]),
- load_16(p,ix[ 8]), load_16(p,ix[ 9]), load_16(p,ix[10]), load_16(p,ix[11]),
- load_16(p,ix[12]), load_16(p,ix[13]), load_16(p,ix[14]), load_16(p,ix[15]) };
+ U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
+ load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
+ load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
+ load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
#endif
return v;
}
-#if !defined(USING_AVX2)
- // Helpers for gather_24/48(), loading the ix'th 24/48-bit value from p, and 1/2 extra bytes.
- SI uint32_t load_24_32(const uint8_t* p, int ix) {
- return load<uint32_t>(p + 3*ix);
- }
- SI uint64_t load_48_64(const uint8_t* p, int ix) {
- return load<uint64_t>(p + 6*ix);
- }
+SI U32 gather_32(const uint8_t* p, I32 ix) {
+ // Load the i'th 32-bit value from p.
+ auto load_32 = [p](int i) {
+ return load<uint32_t>(p + 4*i);
+ };
+#if N == 1
+ U32 v = load_32(ix);
+#elif N == 4
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
+#elif N == 8
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
+ load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
+#elif N == 16
+ U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
+ load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
+ load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
+ load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
#endif
+ // TODO: AVX2 and AVX-512 gathers (cf. gather_24).
+ return v;
+}
SI U32 gather_24(const uint8_t* p, I32 ix) {
// First, back up a byte. Any place we're gathering from has a safe junk byte to read
// in front of it, either a previous table value, or some tag metadata.
p -= 1;
+ // Load the i'th 24-bit value from p, and 1 extra byte.
+ auto load_24_32 = [p](int i) {
+ return load<uint32_t>(p + 3*i);
+ };
+
// Now load multiples of 4 bytes (a junk byte, then r,g,b).
#if N == 1
- U32 v = load_24_32(p,ix);
+ U32 v = load_24_32(ix);
#elif N == 4
- U32 v = { load_24_32(p,ix[0]), load_24_32(p,ix[1]), load_24_32(p,ix[2]), load_24_32(p,ix[3]) };
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
#elif N == 8 && !defined(USING_AVX2)
- U32 v = { load_24_32(p,ix[0]), load_24_32(p,ix[1]), load_24_32(p,ix[2]), load_24_32(p,ix[3]),
- load_24_32(p,ix[4]), load_24_32(p,ix[5]), load_24_32(p,ix[6]), load_24_32(p,ix[7]) };
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
+ load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
#elif N == 8
+ (void)load_24_32;
// The gather instruction here doesn't need any particular alignment,
// but the intrinsic takes a const int*.
const int* p4 = bit_pun<const int*>(p);
@@ -405,6 +422,7 @@
U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
#endif
#elif N == 16
+ (void)load_24_32;
// The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
// And AVX-512 swapped the order of arguments. :/
const int* p4 = bit_pun<const int*>(p);
@@ -420,18 +438,24 @@
// As in gather_24(), with everything doubled.
p -= 2;
+ // Load the i'th 48-bit value from p, and 2 extra bytes.
+ auto load_48_64 = [p](int i) {
+ return load<uint64_t>(p + 6*i);
+ };
+
#if N == 1
- *v = load_48_64(p,ix);
+ *v = load_48_64(ix);
#elif N == 4
*v = U64{
- load_48_64(p,ix[0]), load_48_64(p,ix[1]), load_48_64(p,ix[2]), load_48_64(p,ix[3]),
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
};
#elif N == 8 && !defined(USING_AVX2)
*v = U64{
- load_48_64(p,ix[0]), load_48_64(p,ix[1]), load_48_64(p,ix[2]), load_48_64(p,ix[3]),
- load_48_64(p,ix[4]), load_48_64(p,ix[5]), load_48_64(p,ix[6]), load_48_64(p,ix[7]),
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
+ load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
};
#elif N == 8
+ (void)load_48_64;
typedef int32_t __attribute__((vector_size(16))) Half_I32;
typedef long long __attribute__((vector_size(32))) Half_I64;
@@ -456,6 +480,7 @@
store((char*)v + 0, lo);
store((char*)v + 32, hi);
#elif N == 16
+ (void)load_48_64;
const long long int* p8 = bit_pun<const long long int*>(p);
__m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
@@ -676,6 +701,17 @@
a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
} break;
+ case Op_load_8888_palette8:{
+ const uint8_t* palette = (const uint8_t*) *args++;
+ I32 ix = cast<I32>(load<U8>(src + 1*i));
+ U32 rgba = gather_32(palette, ix);
+
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
+ } break;
+
case Op_load_1010102:{
U32 rgba = load<U32>(src + 4*i);
diff --git a/tests.c b/tests.c
index ccd737f..23a6792 100644
--- a/tests.c
+++ b/tests.c
@@ -1231,6 +1231,40 @@
&buf, skcms_PixelFormat_BGR_161616BE, upm, xyz, 1) );
}
+static void test_Palette8() {
+ uint32_t palette[256];
+ for (int i = 0; i < 256; i++) {
+ palette[i] = (uint32_t)(255 - i) * 0x01010101;
+ }
+
+ uint8_t src[512];
+ uint32_t dst[512];
+ for (int i = 0; i < 512; i++) {
+ src[i] = (uint8_t)(i % 256);
+ }
+
+ const skcms_ICCProfile* srgb = skcms_sRGB_profile();
+ const skcms_AlphaFormat upm = skcms_AlphaFormat_Unpremul;
+
+ expect( skcms_TransformWithPalette(src, skcms_PixelFormat_RGBA_8888_Palette8, upm, srgb,
+ dst, skcms_PixelFormat_RGBA_8888 , upm, srgb,
+ 512, palette) );
+
+ for (int i = 0; i < 512; i++) {
+ uint32_t expected = (uint32_t)(255 - i%256) * 0x01010101;
+ expect( dst[i] == expected );
+ }
+
+
+ // Double check we can't transform skcms_PixelFormat_RGBA_8888_Palette8 without a palette.
+ expect( !skcms_Transform(src, skcms_PixelFormat_RGBA_8888_Palette8, upm, srgb,
+ dst, skcms_PixelFormat_RGBA_8888 , upm, srgb,
+ 512) );
+ expect( !skcms_TransformWithPalette(src, skcms_PixelFormat_RGBA_8888_Palette8, upm, srgb,
+ dst, skcms_PixelFormat_RGBA_8888 , upm, srgb,
+ 512, NULL) );
+}
+
int main(int argc, char** argv) {
bool regenTestData = false;
for (int i = 1; i < argc; ++i) {
@@ -1265,6 +1299,7 @@
test_ExactlyEqual();
test_Clamp();
test_AliasedTransforms();
+ test_Palette8();
#if 0
test_CLUT();
#endif