fix up 16-bit formats for fp16

This is a mix of relaxing the test precision
and fixing the implementation (going via FP32).
I'm open to ideas for both sides of things.

Change-Id: I3b559bdb6857f6072e22ea06b536ef02640bb49e
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/215949
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index d115d90..799eda0 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -544,6 +544,11 @@
     return cast<F>(v) * (1/65535.0f);
 }
 
+SI U16 U16_from_F(F v) {
+    // 65535 == inf in FP16, so promote to FP32 before converting.
+    return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
+}
+
 SI F minus_1_ulp(F v) {
 #if defined(USING_NEON_FP16)
     return bit_pun<F>( bit_pun<U16>(v) - 1 );
@@ -1166,22 +1171,22 @@
                 uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
             #if defined(USING_NEON_FP16)
                 uint16x8x3_t v = {{
-                    (uint16x8_t)to_fixed(r * 65535),
-                    (uint16x8_t)to_fixed(g * 65535),
-                    (uint16x8_t)to_fixed(b * 65535),
+                    (uint16x8_t)U16_from_F(r),
+                    (uint16x8_t)U16_from_F(g),
+                    (uint16x8_t)U16_from_F(b),
                 }};
                 vst3q_u16(rgb, v);
             #elif defined(USING_NEON)
                 uint16x4x3_t v = {{
-                    (uint16x4_t)cast<U16>(to_fixed(r * 65535)),
-                    (uint16x4_t)cast<U16>(to_fixed(g * 65535)),
-                    (uint16x4_t)cast<U16>(to_fixed(b * 65535)),
+                    (uint16x4_t)U16_from_F(r),
+                    (uint16x4_t)U16_from_F(g),
+                    (uint16x4_t)U16_from_F(b),
                 }};
                 vst3_u16(rgb, v);
             #else
-                store_3(rgb+0, cast<U16>(to_fixed(r * 65535)));
-                store_3(rgb+1, cast<U16>(to_fixed(g * 65535)));
-                store_3(rgb+2, cast<U16>(to_fixed(b * 65535)));
+                store_3(rgb+0, U16_from_F(r));
+                store_3(rgb+1, U16_from_F(g));
+                store_3(rgb+2, U16_from_F(b));
             #endif
 
             } return;
@@ -1192,18 +1197,18 @@
                 uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
             #if defined(USING_NEON_FP16)
                 uint16x8x4_t v = {{
-                    (uint16x8_t)to_fixed(r * 65535),
-                    (uint16x8_t)to_fixed(g * 65535),
-                    (uint16x8_t)to_fixed(b * 65535),
-                    (uint16x8_t)to_fixed(a * 65535),
+                    (uint16x8_t)U16_from_F(r),
+                    (uint16x8_t)U16_from_F(g),
+                    (uint16x8_t)U16_from_F(b),
+                    (uint16x8_t)U16_from_F(a),
                 }};
                 vst4q_u16(rgba, v);
             #elif defined(USING_NEON)
                 uint16x4x4_t v = {{
-                    (uint16x4_t)cast<U16>(to_fixed(r * 65535)),
-                    (uint16x4_t)cast<U16>(to_fixed(g * 65535)),
-                    (uint16x4_t)cast<U16>(to_fixed(b * 65535)),
-                    (uint16x4_t)cast<U16>(to_fixed(a * 65535)),
+                    (uint16x4_t)U16_from_F(r),
+                    (uint16x4_t)U16_from_F(g),
+                    (uint16x4_t)U16_from_F(b),
+                    (uint16x4_t)U16_from_F(a),
                 }};
                 vst4_u16(rgba, v);
             #else
@@ -1221,16 +1226,16 @@
                 uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
             #if defined(USING_NEON_FP16)
                 uint16x8x3_t v = {{
-                    (uint16x8_t)swap_endian_16(to_fixed(r * 65535)),
-                    (uint16x8_t)swap_endian_16(to_fixed(g * 65535)),
-                    (uint16x8_t)swap_endian_16(to_fixed(b * 65535)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                 }};
                 vst3q_u16(rgb, v);
             #elif defined(USING_NEON)
                 uint16x4x3_t v = {{
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                 }};
                 vst3_u16(rgb, v);
             #else
@@ -1250,18 +1255,18 @@
                 uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
             #if defined(USING_NEON_FP16)
                 uint16x8x4_t v = {{
-                    (uint16x8_t)swap_endian_16(to_fixed(r * 65535)),
-                    (uint16x8_t)swap_endian_16(to_fixed(g * 65535)),
-                    (uint16x8_t)swap_endian_16(to_fixed(b * 65535)),
-                    (uint16x8_t)swap_endian_16(to_fixed(a * 65535)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
+                    (uint16x8_t)swap_endian_16(U16_from_F(a)),
                 }};
                 vst4q_u16(rgba, v);
             #elif defined(USING_NEON)
                 uint16x4x4_t v = {{
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
-                    (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(a * 65535))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
                 }};
                 vst4_u16(rgba, v);
             #else
diff --git a/tests.c b/tests.c
index 005d166..4926e90 100644
--- a/tests.c
+++ b/tests.c
@@ -18,6 +18,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
+    static bool kFP16 = true;
+#else
+    static bool kFP16 = false;
+#endif
+
 #if defined(_MSC_VER)
     #define DEBUGBREAK __debugbreak
 #elif defined(__clang__)
@@ -35,6 +41,23 @@
         }                                                                             \
     } while(false)
 
+#define expect_close(x,y)                                                             \
+    do {                                                                              \
+        int X = (x),                                                                  \
+            Y = (y);                                                                  \
+        float ratio = (X < Y) ? (float)X / Y                                          \
+                    : (Y < X) ? (float)Y / X                                          \
+                    : 1.0f;                                                           \
+        if (ratio < (kFP16 ? 0.995f : 1.0f)) {                                        \
+            fprintf(stderr, "expect_close(" #x "==%d, " #y "==%d) failed at %s:%d\n", \
+                    X,Y, __FILE__,__LINE__);                                          \
+            fflush(stderr);   /* stderr is buffered on Windows. */                    \
+            DEBUGBREAK();                                                             \
+        }                                                                             \
+    } while(false)
+
+
+
 static void test_ICCProfile() {
     // Nothing works yet.  :)
     skcms_ICCProfile profile;
@@ -220,10 +243,10 @@
                            back, skcms_PixelFormat_RGBA_16161616LE,skcms_AlphaFormat_Unpremul, NULL,
                            16384));
     for (int i = 0; i < 16384; i++) {
-        expect( ((back[i] >>  0) & 0xffff) == ((dst[i] >>  0) & 0xff) * 0x0101);
-        expect( ((back[i] >> 16) & 0xffff) == ((dst[i] >>  8) & 0xff) * 0x0101);
-        expect( ((back[i] >> 32) & 0xffff) == ((dst[i] >> 16) & 0xff) * 0x0101);
-        expect( ((back[i] >> 48) & 0xffff) == ((dst[i] >> 24) & 0xff) * 0x0101);
+        expect_close( ((back[i] >>  0) & 0xffff) , ((dst[i] >>  0) & 0xff) * 0x0101);
+        expect_close( ((back[i] >> 16) & 0xffff) , ((dst[i] >>  8) & 0xff) * 0x0101);
+        expect_close( ((back[i] >> 32) & 0xffff) , ((dst[i] >> 16) & 0xff) * 0x0101);
+        expect_close( ((back[i] >> 48) & 0xffff) , ((dst[i] >> 24) & 0xff) * 0x0101);
     }
 
     free(src);
@@ -259,10 +282,15 @@
                             0x0101, 0x0101, 0xffff,
                             0xffff, 0xffff, 0xffff };
     for (int i = 0; i < 12; i++) {
-        expect(back[i] == expected[i]);
+        expect_close(back[i], expected[i]);
     }
 }
 
+static int bswap16(int x) {
+    return (x & 0x00ff) << 8
+         | (x & 0xff00) >> 8;
+}
+
 static void test_FormatConversions_16161616BE() {
     // We want to hit each 16-bit value, 4 per each of 16384 pixels.
     uint64_t* src = malloc(8 * 16384);
@@ -285,7 +313,7 @@
     // so the low lanes are actually the most significant byte, and the high least.
 
     expect(dst[    0] == 0x03020100);
-    expect(dst[ 8127] == 0xfefefdfc);  // 0x7eff rounds down to 0xfe, 0x7efe rounds up to 0xfe.
+    expect(dst[ 8127] == (kFP16 ? 0xfffefdfc : 0xfefefdfc));
     expect(dst[16383] == 0xfffefdfc);
 
     // We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -295,10 +323,10 @@
                            back, skcms_PixelFormat_RGBA_16161616BE,skcms_AlphaFormat_Unpremul, NULL,
                            16384));
     for (int i = 0; i < 16384; i++) {
-        expect( ((back[i] >>  0) & 0xffff) == ((dst[i] >>  0) & 0xff) * 0x0101);
-        expect( ((back[i] >> 16) & 0xffff) == ((dst[i] >>  8) & 0xff) * 0x0101);
-        expect( ((back[i] >> 32) & 0xffff) == ((dst[i] >> 16) & 0xff) * 0x0101);
-        expect( ((back[i] >> 48) & 0xffff) == ((dst[i] >> 24) & 0xff) * 0x0101);
+        expect_close(bswap16((back[i] >>  0) & 0xffff), ((dst[i] >>  0) & 0xff) * 0x0101);
+        expect_close(bswap16((back[i] >> 16) & 0xffff), ((dst[i] >>  8) & 0xff) * 0x0101);
+        expect_close(bswap16((back[i] >> 32) & 0xffff), ((dst[i] >> 16) & 0xff) * 0x0101);
+        expect_close(bswap16((back[i] >> 48) & 0xffff), ((dst[i] >> 24) & 0xff) * 0x0101);
     }
 
     free(src);
@@ -319,7 +347,7 @@
 
     expect(dst[0] == 0xff020100);
     expect(dst[1] == 0xfffdfc03);
-    expect(dst[2] == 0xfffcfefe);
+    expect(dst[2] == (kFP16 ? 0xfffcfffe : 0xfffcfefe));
     expect(dst[3] == 0xfffffefd);
 
     // We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -333,7 +361,7 @@
                             0xfefe, 0xfefe, 0xfcfc,
                             0xfdfd, 0xfefe, 0xffff };
     for (int i = 0; i < 12; i++) {
-        expect(back[i] == expected[i]);
+        expect_close(bswap16(back[i]), expected[i]);
     }
 }
 
@@ -1336,6 +1364,10 @@
     test_FormatConversions();
     test_FormatConversions_565();
     test_FormatConversions_101010();
+    test_FormatConversions_16161616LE();
+    test_FormatConversions_161616LE();
+    test_FormatConversions_16161616BE();
+    test_FormatConversions_161616BE();
     test_FormatConversions_half_norm();
     test_ApproximateCurve_clamped();
     test_Matrix3x3_invert();
@@ -1352,16 +1384,7 @@
     test_TF_invert();
 
     // Temporarily disable some tests while getting FP16 compute working.
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
-    bool skip = true;
-#else
-    bool skip = false;
-#endif
-    if (!skip) {
-        test_FormatConversions_16161616LE();
-        test_FormatConversions_161616LE();
-        test_FormatConversions_16161616BE();
-        test_FormatConversions_161616BE();
+    if (!kFP16) {
         test_FormatConversions_half();
         test_FormatConversions_float();
         test_Parse(regenTestData);