Remove unfinished ARM F16 compute path

This was a distraction, and there were no plans to finish the
implementation.

Change-Id: I29046c2c4bbf5b402d47112753515416ce8c83cc
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/698183
Reviewed-by: Kevin Lubick <kjlubick@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/build.ninja b/build.ninja
index faed2c6..11f1e47 100644
--- a/build.ninja
+++ b/build.ninja
@@ -25,7 +25,6 @@
 subninja ninja/gcc.xsan
 
 subninja ninja/android
-subninja ninja/android.fp16
 subninja ninja/android.nofp16
 subninja ninja/android.lsan
 subninja ninja/android.portable
diff --git a/ninja/android.fp16 b/ninja/android.fp16
deleted file mode 100644
index 058a54a..0000000
--- a/ninja/android.fp16
+++ /dev/null
@@ -1,3 +0,0 @@
-mode         = .fp16
-extra_cflags = -march=armv8.2a+fp16 -DSKCMS_OPT_INTO_NEON_FP16 -Wno-implicit-float-conversion
-include ninja/android
diff --git a/skcms.cc b/skcms.cc
index 246c08a..3940c0a 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -2403,10 +2403,6 @@
     #define N 8
     template <typename T> using V = Vec<N,T>;
     using Color = float;
-#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
-    #define N 8
-    template <typename T> using V = Vec<N,T>;
-    using Color = _Float16;
 #else
     #define N 4
     template <typename T> using V = Vec<N,T>;
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 350f6a2..17324b2 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -54,9 +54,6 @@
     #if __ARM_FP & 2
         #define USING_NEON_F16C
     #endif
-    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
-        #define USING_NEON_FP16
-    #endif
 #endif
 
 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
@@ -132,12 +129,7 @@
 // When we convert from float to fixed point, it's very common to want to round,
 // and for some reason compilers generate better code when converting to int32_t.
 // To serve both those ends, we use this function to_fixed() instead of direct cast().
-#if defined(USING_NEON_FP16)
-    // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
-    SI U16 to_fixed(F f) {  return cast<U16>(f + 0.5f); }
-#else
-    SI U32 to_fixed(F f) {  return (U32)cast<I32>(f + 0.5f); }
-#endif
+SI U32 to_fixed(F f) {  return (U32)cast<I32>(f + 0.5f); }
 
 
 // Sometimes we do something crazy on one branch of a conditonal,
@@ -158,9 +150,7 @@
 
 
 SI F F_from_Half(U16 half) {
-#if defined(USING_NEON_FP16)
-    return bit_pun<F>(half);
-#elif defined(USING_NEON_F16C)
+#if defined(USING_NEON_F16C)
     return vcvt_f32_f16((float16x4_t)half);
 #elif defined(USING_AVX512F)
     return (F)_mm512_cvtph_ps((__m256i)half);
@@ -187,9 +177,7 @@
     __attribute__((no_sanitize("unsigned-integer-overflow")))
 #endif
 SI U16 Half_from_F(F f) {
-#if defined(USING_NEON_FP16)
-    return bit_pun<U16>(f);
-#elif defined(USING_NEON_F16C)
+#if defined(USING_NEON_F16C)
     return (U16)vcvt_f16_f32(f);
 #elif defined(USING_AVX512F)
     return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
@@ -208,11 +196,7 @@
 }
 
 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
-#if defined(USING_NEON_FP16)
-    SI U16 swap_endian_16(U16 v) {
-        return (U16)vrev16q_u8((uint8x16_t) v);
-    }
-#elif defined(USING_NEON)
+#if defined(USING_NEON)
     SI U16 swap_endian_16(U16 v) {
         return (U16)vrev16_u8((uint8x8_t) v);
     }
@@ -223,10 +207,7 @@
          | (rgba & 0xff00ff00ff00ff00) >> 8;
 }
 
-#if defined(USING_NEON_FP16)
-    SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
-    SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
-#elif defined(USING_NEON)
+#if defined(USING_NEON)
     SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
     SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
 #else
@@ -237,8 +218,6 @@
 SI F floor_(F x) {
 #if N == 1
     return floorf_(x);
-#elif defined(USING_NEON_FP16)
-    return vrndmq_f16(x);
 #elif defined(__aarch64__)
     return vrndmq_f32(x);
 #elif defined(USING_AVX512F)
@@ -263,10 +242,6 @@
 }
 
 SI F approx_log2(F x) {
-#if defined(USING_NEON_FP16)
-    // TODO(mtklein)
-    return x;
-#else
     // The first approximation of log2(x) is its exponent 'e', minus 127.
     I32 bits = bit_pun<I32>(x);
 
@@ -278,7 +253,6 @@
     return e - 124.225514990f
              -   1.498030302f*m
              -   1.725879990f/(0.3520887068f + m);
-#endif
 }
 
 SI F approx_log(F x) {
@@ -287,10 +261,6 @@
 }
 
 SI F approx_exp2(F x) {
-#if defined(USING_NEON_FP16)
-    // TODO(mtklein)
-    return x;
-#else
     F fract = x - floor_(x);
 
     F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
@@ -299,7 +269,6 @@
     I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
 
     return bit_pun<F>(bits);
-#endif
 }
 
 SI F approx_pow(F x, float y) {
@@ -314,11 +283,6 @@
 
 // Return tf(x).
 SI F apply_tf(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
-    // TODO(mtklein)
-    (void)tf;
-    return x;
-#else
     // Peel off the sign bit and set x = |x|.
     U32 bits = bit_pun<U32>(x),
         sign = bits & 0x80000000;
@@ -330,15 +294,9 @@
 
     // Tack the sign bit back on.
     return bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
 }
 
 SI F apply_pq(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
-    // TODO(mtklein)
-    (void)tf;
-    return x;
-#else
     U32 bits = bit_pun<U32>(x),
         sign = bits & 0x80000000;
     x = bit_pun<F>(bits ^ sign);
@@ -348,15 +306,9 @@
                      tf->f);
 
     return bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
 }
 
 SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
-    // TODO(mtklein)
-    (void)tf;
-    return x;
-#else
     const float R = tf->a, G = tf->b,
                 a = tf->c, b = tf->d, c = tf->e,
                 K = tf->f + 1;
@@ -368,15 +320,9 @@
                                , approx_exp((x-c)*a) + b);
 
     return K*bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
 }
 
 SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
-    // TODO(mtklein)
-    (void)tf;
-    return x;
-#else
     const float R = tf->a, G = tf->b,
                 a = tf->c, b = tf->d, c = tf->e,
                 K = tf->f + 1;
@@ -389,7 +335,6 @@
                              , a * approx_log(x - b) + c);
 
     return bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
 }
 
 
@@ -636,11 +581,7 @@
 }
 
 SI F minus_1_ulp(F v) {
-#if defined(USING_NEON_FP16)
-    return bit_pun<F>( bit_pun<U16>(v) - 1 );
-#else
     return bit_pun<F>( bit_pun<U32>(v) - 1 );
-#endif
 }
 
 SI F table(const skcms_Curve* curve, F v) {
@@ -835,23 +776,7 @@
 
             case Op_load_888:{
                 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
-            #if defined(USING_NEON_FP16)
-                // See the explanation under USING_NEON below.  This is that doubled up.
-                uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
-                v = vld3q_lane_u8(rgb+ 0, v,  0);
-                v = vld3q_lane_u8(rgb+ 3, v,  2);
-                v = vld3q_lane_u8(rgb+ 6, v,  4);
-                v = vld3q_lane_u8(rgb+ 9, v,  6);
-
-                v = vld3q_lane_u8(rgb+12, v,  8);
-                v = vld3q_lane_u8(rgb+15, v, 10);
-                v = vld3q_lane_u8(rgb+18, v, 12);
-                v = vld3q_lane_u8(rgb+21, v, 14);
-
-                r = cast<F>((U16)v.val[0]) * (1/255.0f);
-                g = cast<F>((U16)v.val[1]) * (1/255.0f);
-                b = cast<F>((U16)v.val[2]) * (1/255.0f);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
                 // a time.  Since we're doing that, we might as well load them into 16-bit lanes.
                 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
@@ -917,12 +842,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 6*i);
                 assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x3_t v = vld3q_u16(rgb);
-                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
-                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
-                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x3_t v = vld3_u16(rgb);
                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
@@ -938,13 +858,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 8*i);
                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x4_t v = vld4q_u16(rgba);
-                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
-                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
-                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
-                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x4_t v = vld4_u16(rgba);
                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
@@ -964,12 +878,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 6*i);
                 assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x3_t v = vld3q_u16(rgb);
-                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
-                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
-                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x3_t v = vld3_u16(rgb);
                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
@@ -989,13 +898,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 8*i);
                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x4_t v = vld4q_u16(rgba);
-                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
-                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
-                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
-                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x4_t v = vld4_u16(rgba);
                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
@@ -1015,12 +918,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 6*i);
                 assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x3_t v = vld3q_u16(rgb);
-                U16 R = (U16)v.val[0],
-                    G = (U16)v.val[1],
-                    B = (U16)v.val[2];
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x3_t v = vld3_u16(rgb);
                 U16 R = (U16)v.val[0],
                     G = (U16)v.val[1],
@@ -1039,13 +937,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 8*i);
                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x4_t v = vld4q_u16(rgba);
-                U16 R = (U16)v.val[0],
-                    G = (U16)v.val[1],
-                    B = (U16)v.val[2],
-                    A = (U16)v.val[3];
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x4_t v = vld4_u16(rgba);
                 U16 R = (U16)v.val[0],
                     G = (U16)v.val[1],
@@ -1068,13 +960,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 12*i);
                 assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
                 const float* rgb = (const float*)ptr;       // cast to const float* to be safe.
-            #if defined(USING_NEON_FP16)
-                float32x4x3_t lo = vld3q_f32(rgb +  0),
-                              hi = vld3q_f32(rgb + 12);
-                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
-                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
-                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 float32x4x3_t v = vld3q_f32(rgb);
                 r = (F)v.val[0];
                 g = (F)v.val[1];
@@ -1090,14 +976,7 @@
                 uintptr_t ptr = (uintptr_t)(src + 16*i);
                 assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
                 const float* rgba = (const float*)ptr;      // cast to const float* to be safe.
-            #if defined(USING_NEON_FP16)
-                float32x4x4_t lo = vld4q_f32(rgba +  0),
-                              hi = vld4q_f32(rgba + 16);
-                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
-                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
-                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
-                a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 float32x4x4_t v = vld4q_f32(rgba);
                 r = (F)v.val[0];
                 g = (F)v.val[1];
@@ -1280,23 +1159,7 @@
 
             case Op_store_888: {
                 uint8_t* rgb = (uint8_t*)dst + 3*i;
-            #if defined(USING_NEON_FP16)
-                // See the explanation under USING_NEON below.  This is that doubled up.
-                U16 R = to_fixed(r * 255),
-                    G = to_fixed(g * 255),
-                    B = to_fixed(b * 255);
-
-                uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
-                vst3q_lane_u8(rgb+ 0, v,  0);
-                vst3q_lane_u8(rgb+ 3, v,  2);
-                vst3q_lane_u8(rgb+ 6, v,  4);
-                vst3q_lane_u8(rgb+ 9, v,  6);
-
-                vst3q_lane_u8(rgb+12, v,  8);
-                vst3q_lane_u8(rgb+15, v, 10);
-                vst3q_lane_u8(rgb+18, v, 12);
-                vst3q_lane_u8(rgb+21, v, 14);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
                 // get there via U16 to save some instructions converting to float.  And just
                 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
@@ -1343,14 +1206,7 @@
                 uintptr_t ptr = (uintptr_t)(dst + 6*i);
                 assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                 uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x3_t v = {{
-                    (uint16x8_t)U16_from_F(r),
-                    (uint16x8_t)U16_from_F(g),
-                    (uint16x8_t)U16_from_F(b),
-                }};
-                vst3q_u16(rgb, v);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x3_t v = {{
                     (uint16x4_t)U16_from_F(r),
                     (uint16x4_t)U16_from_F(g),
@@ -1369,15 +1225,7 @@
                 uintptr_t ptr = (uintptr_t)(dst + 8*i);
                 assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
                 uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x4_t v = {{
-                    (uint16x8_t)U16_from_F(r),
-                    (uint16x8_t)U16_from_F(g),
-                    (uint16x8_t)U16_from_F(b),
-                    (uint16x8_t)U16_from_F(a),
-                }};
-                vst4q_u16(rgba, v);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x4_t v = {{
                     (uint16x4_t)U16_from_F(r),
                     (uint16x4_t)U16_from_F(g),
@@ -1398,14 +1246,7 @@
                 uintptr_t ptr = (uintptr_t)(dst + 6*i);
                 assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                 uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x3_t v = {{
-                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
-                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
-                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
-                }};
-                vst3q_u16(rgb, v);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x3_t v = {{
                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
@@ -1427,15 +1268,7 @@
                 uintptr_t ptr = (uintptr_t)(dst + 8*i);
                 assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
                 uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
-            #if defined(USING_NEON_FP16)
-                uint16x8x4_t v = {{
-                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
-                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
-                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
-                    (uint16x8_t)swap_endian_16(U16_from_F(a)),
-                }};
-                vst4q_u16(rgba, v);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x4_t v = {{
                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
@@ -1460,14 +1293,7 @@
                 U16 R = Half_from_F(r),
                     G = Half_from_F(g),
                     B = Half_from_F(b);
-            #if defined(USING_NEON_FP16)
-                uint16x8x3_t v = {{
-                    (uint16x8_t)R,
-                    (uint16x8_t)G,
-                    (uint16x8_t)B,
-                }};
-                vst3q_u16(rgb, v);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x3_t v = {{
                     (uint16x4_t)R,
                     (uint16x4_t)G,
@@ -1490,15 +1316,7 @@
                     G = Half_from_F(g),
                     B = Half_from_F(b),
                     A = Half_from_F(a);
-            #if defined(USING_NEON_FP16)
-                uint16x8x4_t v = {{
-                    (uint16x8_t)R,
-                    (uint16x8_t)G,
-                    (uint16x8_t)B,
-                    (uint16x8_t)A,
-                }};
-                vst4q_u16(rgba, v);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 uint16x4x4_t v = {{
                     (uint16x4_t)R,
                     (uint16x4_t)G,
@@ -1519,19 +1337,7 @@
                 uintptr_t ptr = (uintptr_t)(dst + 12*i);
                 assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
                 float* rgb = (float*)ptr;                // for this cast to float* to be safe.
-            #if defined(USING_NEON_FP16)
-                float32x4x3_t lo = {{
-                    vcvt_f32_f16(vget_low_f16(r)),
-                    vcvt_f32_f16(vget_low_f16(g)),
-                    vcvt_f32_f16(vget_low_f16(b)),
-                }}, hi = {{
-                    vcvt_f32_f16(vget_high_f16(r)),
-                    vcvt_f32_f16(vget_high_f16(g)),
-                    vcvt_f32_f16(vget_high_f16(b)),
-                }};
-                vst3q_f32(rgb +  0, lo);
-                vst3q_f32(rgb + 12, hi);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 float32x4x3_t v = {{
                     (float32x4_t)r,
                     (float32x4_t)g,
@@ -1549,21 +1355,7 @@
                 uintptr_t ptr = (uintptr_t)(dst + 16*i);
                 assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
                 float* rgba = (float*)ptr;               // for this cast to float* to be safe.
-            #if defined(USING_NEON_FP16)
-                float32x4x4_t lo = {{
-                    vcvt_f32_f16(vget_low_f16(r)),
-                    vcvt_f32_f16(vget_low_f16(g)),
-                    vcvt_f32_f16(vget_low_f16(b)),
-                    vcvt_f32_f16(vget_low_f16(a)),
-                }}, hi = {{
-                    vcvt_f32_f16(vget_high_f16(r)),
-                    vcvt_f32_f16(vget_high_f16(g)),
-                    vcvt_f32_f16(vget_high_f16(b)),
-                    vcvt_f32_f16(vget_high_f16(a)),
-                }};
-                vst4q_f32(rgba +  0, lo);
-                vst4q_f32(rgba + 16, hi);
-            #elif defined(USING_NEON)
+            #if defined(USING_NEON)
                 float32x4x4_t v = {{
                     (float32x4_t)r,
                     (float32x4_t)g,
@@ -1621,8 +1413,5 @@
 #if defined(USING_NEON_F16C)
     #undef  USING_NEON_F16C
 #endif
-#if defined(USING_NEON_FP16)
-    #undef  USING_NEON_FP16
-#endif
 
 #undef FALLTHROUGH
diff --git a/tests.c b/tests.c
index c4ca414..04a7a28 100644
--- a/tests.c
+++ b/tests.c
@@ -18,12 +18,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
-    static bool kFP16 = true;
-#else
-    static bool kFP16 = false;
-#endif
-
 #if defined(_MSC_VER)
     #define DEBUGBREAK __debugbreak
 #elif defined(__clang__)
@@ -53,7 +47,7 @@
             double ratio = (X < Y) ? X / Y                                                \
                          : (Y < X) ? Y / X                                                \
                          : 1.0;                                                           \
-            if (ratio < (kFP16 ? 0.995 : 1.0)) {                                          \
+            if (ratio < 1.0) {                                                            \
                 fprintf(stderr, "expect_close(" #x "==%g, " #y "==%g) failed at %s:%d\n", \
                         X,Y, __FILE__,__LINE__);                                          \
                 fflush(stderr);   /* stderr is buffered on Windows. */                    \
@@ -319,7 +313,7 @@
     // so the low lanes are actually the most significant byte, and the high least.
 
     expect(dst[    0] == 0x03020100);
-    expect(dst[ 8127] == (kFP16 ? 0xfffefdfc : 0xfefefdfc));
+    expect(dst[ 8127] == 0xfefefdfc);
     expect(dst[16383] == 0xfffefdfc);
 
     // We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -353,7 +347,7 @@
 
     expect(dst[0] == 0xff020100);
     expect(dst[1] == 0xfffdfc03);
-    expect(dst[2] == (kFP16 ? 0xfffcfffe : 0xfffcfefe));
+    expect(dst[2] == 0xfffcfefe);
     expect(dst[3] == 0xfffffefd);
 
     // We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -1925,12 +1919,10 @@
     test_ParseWithA2BPriority();
     test_B2A();
 
-    // Temporarily disable some tests while getting FP16 compute working.
-    if (!kFP16) {
-        test_Parse(regenTestData);
-        test_sRGB_AllBytes();
-        test_TRC_Table16();
-    }
+    test_Parse(regenTestData);
+    test_sRGB_AllBytes();
+    test_TRC_Table16();
+
 #if 0
     test_CLUT();
 #endif