fix up 16-bit formats for fp16
This is a mix of relaxing the test precision
and fixing the implementation (going via FP32).
I'm open to ideas for both sides of things.
Change-Id: I3b559bdb6857f6072e22ea06b536ef02640bb49e
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/215949
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index d115d90..799eda0 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -544,6 +544,11 @@
return cast<F>(v) * (1/65535.0f);
}
+SI U16 U16_from_F(F v) {
+ // 65535 == inf in FP16, so promote to FP32 before converting.
+ return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
+}
+
SI F minus_1_ulp(F v) {
#if defined(USING_NEON_FP16)
return bit_pun<F>( bit_pun<U16>(v) - 1 );
@@ -1166,22 +1171,22 @@
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
#if defined(USING_NEON_FP16)
uint16x8x3_t v = {{
- (uint16x8_t)to_fixed(r * 65535),
- (uint16x8_t)to_fixed(g * 65535),
- (uint16x8_t)to_fixed(b * 65535),
+ (uint16x8_t)U16_from_F(r),
+ (uint16x8_t)U16_from_F(g),
+ (uint16x8_t)U16_from_F(b),
}};
vst3q_u16(rgb, v);
#elif defined(USING_NEON)
uint16x4x3_t v = {{
- (uint16x4_t)cast<U16>(to_fixed(r * 65535)),
- (uint16x4_t)cast<U16>(to_fixed(g * 65535)),
- (uint16x4_t)cast<U16>(to_fixed(b * 65535)),
+ (uint16x4_t)U16_from_F(r),
+ (uint16x4_t)U16_from_F(g),
+ (uint16x4_t)U16_from_F(b),
}};
vst3_u16(rgb, v);
#else
- store_3(rgb+0, cast<U16>(to_fixed(r * 65535)));
- store_3(rgb+1, cast<U16>(to_fixed(g * 65535)));
- store_3(rgb+2, cast<U16>(to_fixed(b * 65535)));
+ store_3(rgb+0, U16_from_F(r));
+ store_3(rgb+1, U16_from_F(g));
+ store_3(rgb+2, U16_from_F(b));
#endif
} return;
@@ -1192,18 +1197,18 @@
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
#if defined(USING_NEON_FP16)
uint16x8x4_t v = {{
- (uint16x8_t)to_fixed(r * 65535),
- (uint16x8_t)to_fixed(g * 65535),
- (uint16x8_t)to_fixed(b * 65535),
- (uint16x8_t)to_fixed(a * 65535),
+ (uint16x8_t)U16_from_F(r),
+ (uint16x8_t)U16_from_F(g),
+ (uint16x8_t)U16_from_F(b),
+ (uint16x8_t)U16_from_F(a),
}};
vst4q_u16(rgba, v);
#elif defined(USING_NEON)
uint16x4x4_t v = {{
- (uint16x4_t)cast<U16>(to_fixed(r * 65535)),
- (uint16x4_t)cast<U16>(to_fixed(g * 65535)),
- (uint16x4_t)cast<U16>(to_fixed(b * 65535)),
- (uint16x4_t)cast<U16>(to_fixed(a * 65535)),
+ (uint16x4_t)U16_from_F(r),
+ (uint16x4_t)U16_from_F(g),
+ (uint16x4_t)U16_from_F(b),
+ (uint16x4_t)U16_from_F(a),
}};
vst4_u16(rgba, v);
#else
@@ -1221,16 +1226,16 @@
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
#if defined(USING_NEON_FP16)
uint16x8x3_t v = {{
- (uint16x8_t)swap_endian_16(to_fixed(r * 65535)),
- (uint16x8_t)swap_endian_16(to_fixed(g * 65535)),
- (uint16x8_t)swap_endian_16(to_fixed(b * 65535)),
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
}};
vst3q_u16(rgb, v);
#elif defined(USING_NEON)
uint16x4x3_t v = {{
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
}};
vst3_u16(rgb, v);
#else
@@ -1250,18 +1255,18 @@
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
#if defined(USING_NEON_FP16)
uint16x8x4_t v = {{
- (uint16x8_t)swap_endian_16(to_fixed(r * 65535)),
- (uint16x8_t)swap_endian_16(to_fixed(g * 65535)),
- (uint16x8_t)swap_endian_16(to_fixed(b * 65535)),
- (uint16x8_t)swap_endian_16(to_fixed(a * 65535)),
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
+ (uint16x8_t)swap_endian_16(U16_from_F(a)),
}};
vst4q_u16(rgba, v);
#elif defined(USING_NEON)
uint16x4x4_t v = {{
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
- (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(a * 65535))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
}};
vst4_u16(rgba, v);
#else
diff --git a/tests.c b/tests.c
index 005d166..4926e90 100644
--- a/tests.c
+++ b/tests.c
@@ -18,6 +18,12 @@
#include <stdlib.h>
#include <string.h>
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
+ static bool kFP16 = true;
+#else
+ static bool kFP16 = false;
+#endif
+
#if defined(_MSC_VER)
#define DEBUGBREAK __debugbreak
#elif defined(__clang__)
@@ -35,6 +41,23 @@
} \
} while(false)
+#define expect_close(x,y) \
+ do { \
+ int X = (x), \
+ Y = (y); \
+ float ratio = (X < Y) ? (float)X / Y \
+ : (Y < X) ? (float)Y / X \
+ : 1.0f; \
+ if (ratio < (kFP16 ? 0.995f : 1.0f)) { \
+ fprintf(stderr, "expect_close(" #x "==%d, " #y "==%d) failed at %s:%d\n", \
+ X,Y, __FILE__,__LINE__); \
+ fflush(stderr); /* stderr is buffered on Windows. */ \
+ DEBUGBREAK(); \
+ } \
+ } while(false)
+
+
+
static void test_ICCProfile() {
// Nothing works yet. :)
skcms_ICCProfile profile;
@@ -220,10 +243,10 @@
back, skcms_PixelFormat_RGBA_16161616LE,skcms_AlphaFormat_Unpremul, NULL,
16384));
for (int i = 0; i < 16384; i++) {
- expect( ((back[i] >> 0) & 0xffff) == ((dst[i] >> 0) & 0xff) * 0x0101);
- expect( ((back[i] >> 16) & 0xffff) == ((dst[i] >> 8) & 0xff) * 0x0101);
- expect( ((back[i] >> 32) & 0xffff) == ((dst[i] >> 16) & 0xff) * 0x0101);
- expect( ((back[i] >> 48) & 0xffff) == ((dst[i] >> 24) & 0xff) * 0x0101);
+ expect_close( ((back[i] >> 0) & 0xffff) , ((dst[i] >> 0) & 0xff) * 0x0101);
+ expect_close( ((back[i] >> 16) & 0xffff) , ((dst[i] >> 8) & 0xff) * 0x0101);
+ expect_close( ((back[i] >> 32) & 0xffff) , ((dst[i] >> 16) & 0xff) * 0x0101);
+ expect_close( ((back[i] >> 48) & 0xffff) , ((dst[i] >> 24) & 0xff) * 0x0101);
}
free(src);
@@ -259,10 +282,15 @@
0x0101, 0x0101, 0xffff,
0xffff, 0xffff, 0xffff };
for (int i = 0; i < 12; i++) {
- expect(back[i] == expected[i]);
+ expect_close(back[i], expected[i]);
}
}
+static int bswap16(int x) {
+ return (x & 0x00ff) << 8
+ | (x & 0xff00) >> 8;
+}
+
static void test_FormatConversions_16161616BE() {
// We want to hit each 16-bit value, 4 per each of 16384 pixels.
uint64_t* src = malloc(8 * 16384);
@@ -285,7 +313,7 @@
// so the low lanes are actually the most significant byte, and the high least.
expect(dst[ 0] == 0x03020100);
- expect(dst[ 8127] == 0xfefefdfc); // 0x7eff rounds down to 0xfe, 0x7efe rounds up to 0xfe.
+ expect(dst[ 8127] == (kFP16 ? 0xfffefdfc : 0xfefefdfc));
expect(dst[16383] == 0xfffefdfc);
// We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -295,10 +323,10 @@
back, skcms_PixelFormat_RGBA_16161616BE,skcms_AlphaFormat_Unpremul, NULL,
16384));
for (int i = 0; i < 16384; i++) {
- expect( ((back[i] >> 0) & 0xffff) == ((dst[i] >> 0) & 0xff) * 0x0101);
- expect( ((back[i] >> 16) & 0xffff) == ((dst[i] >> 8) & 0xff) * 0x0101);
- expect( ((back[i] >> 32) & 0xffff) == ((dst[i] >> 16) & 0xff) * 0x0101);
- expect( ((back[i] >> 48) & 0xffff) == ((dst[i] >> 24) & 0xff) * 0x0101);
+ expect_close(bswap16((back[i] >> 0) & 0xffff), ((dst[i] >> 0) & 0xff) * 0x0101);
+ expect_close(bswap16((back[i] >> 16) & 0xffff), ((dst[i] >> 8) & 0xff) * 0x0101);
+ expect_close(bswap16((back[i] >> 32) & 0xffff), ((dst[i] >> 16) & 0xff) * 0x0101);
+ expect_close(bswap16((back[i] >> 48) & 0xffff), ((dst[i] >> 24) & 0xff) * 0x0101);
}
free(src);
@@ -319,7 +347,7 @@
expect(dst[0] == 0xff020100);
expect(dst[1] == 0xfffdfc03);
- expect(dst[2] == 0xfffcfefe);
+ expect(dst[2] == (kFP16 ? 0xfffcfffe : 0xfffcfefe));
expect(dst[3] == 0xfffffefd);
// We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -333,7 +361,7 @@
0xfefe, 0xfefe, 0xfcfc,
0xfdfd, 0xfefe, 0xffff };
for (int i = 0; i < 12; i++) {
- expect(back[i] == expected[i]);
+ expect_close(bswap16(back[i]), expected[i]);
}
}
@@ -1336,6 +1364,10 @@
test_FormatConversions();
test_FormatConversions_565();
test_FormatConversions_101010();
+ test_FormatConversions_16161616LE();
+ test_FormatConversions_161616LE();
+ test_FormatConversions_16161616BE();
+ test_FormatConversions_161616BE();
test_FormatConversions_half_norm();
test_ApproximateCurve_clamped();
test_Matrix3x3_invert();
@@ -1352,16 +1384,7 @@
test_TF_invert();
// Temporarily disable some tests while getting FP16 compute working.
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
- bool skip = true;
-#else
- bool skip = false;
-#endif
- if (!skip) {
- test_FormatConversions_16161616LE();
- test_FormatConversions_161616LE();
- test_FormatConversions_16161616BE();
- test_FormatConversions_161616BE();
+ if (!kFP16) {
test_FormatConversions_half();
test_FormatConversions_float();
test_Parse(regenTestData);