Remove unfinished ARM F16 compute path
This was a distraction, and there were no plans to finish the
implementation.
Change-Id: I29046c2c4bbf5b402d47112753515416ce8c83cc
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/698183
Reviewed-by: Kevin Lubick <kjlubick@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/build.ninja b/build.ninja
index faed2c6..11f1e47 100644
--- a/build.ninja
+++ b/build.ninja
@@ -25,7 +25,6 @@
subninja ninja/gcc.xsan
subninja ninja/android
-subninja ninja/android.fp16
subninja ninja/android.nofp16
subninja ninja/android.lsan
subninja ninja/android.portable
diff --git a/ninja/android.fp16 b/ninja/android.fp16
deleted file mode 100644
index 058a54a..0000000
--- a/ninja/android.fp16
+++ /dev/null
@@ -1,3 +0,0 @@
-mode = .fp16
-extra_cflags = -march=armv8.2a+fp16 -DSKCMS_OPT_INTO_NEON_FP16 -Wno-implicit-float-conversion
-include ninja/android
diff --git a/skcms.cc b/skcms.cc
index 246c08a..3940c0a 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -2403,10 +2403,6 @@
#define N 8
template <typename T> using V = Vec<N,T>;
using Color = float;
-#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
- #define N 8
- template <typename T> using V = Vec<N,T>;
- using Color = _Float16;
#else
#define N 4
template <typename T> using V = Vec<N,T>;
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 350f6a2..17324b2 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -54,9 +54,6 @@
#if __ARM_FP & 2
#define USING_NEON_F16C
#endif
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
- #define USING_NEON_FP16
- #endif
#endif
// These -Wvector-conversion warnings seem to trigger in very bogus situations,
@@ -132,12 +129,7 @@
// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct cast().
-#if defined(USING_NEON_FP16)
- // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
- SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
-#else
- SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
-#endif
+SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
// Sometimes we do something crazy on one branch of a conditonal,
@@ -158,9 +150,7 @@
SI F F_from_Half(U16 half) {
-#if defined(USING_NEON_FP16)
- return bit_pun<F>(half);
-#elif defined(USING_NEON_F16C)
+#if defined(USING_NEON_F16C)
return vcvt_f32_f16((float16x4_t)half);
#elif defined(USING_AVX512F)
return (F)_mm512_cvtph_ps((__m256i)half);
@@ -187,9 +177,7 @@
__attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
SI U16 Half_from_F(F f) {
-#if defined(USING_NEON_FP16)
- return bit_pun<U16>(f);
-#elif defined(USING_NEON_F16C)
+#if defined(USING_NEON_F16C)
return (U16)vcvt_f16_f32(f);
#elif defined(USING_AVX512F)
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
@@ -208,11 +196,7 @@
}
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
-#if defined(USING_NEON_FP16)
- SI U16 swap_endian_16(U16 v) {
- return (U16)vrev16q_u8((uint8x16_t) v);
- }
-#elif defined(USING_NEON)
+#if defined(USING_NEON)
SI U16 swap_endian_16(U16 v) {
return (U16)vrev16_u8((uint8x8_t) v);
}
@@ -223,10 +207,7 @@
| (rgba & 0xff00ff00ff00ff00) >> 8;
}
-#if defined(USING_NEON_FP16)
- SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
- SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
-#elif defined(USING_NEON)
+#if defined(USING_NEON)
SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#else
@@ -237,8 +218,6 @@
SI F floor_(F x) {
#if N == 1
return floorf_(x);
-#elif defined(USING_NEON_FP16)
- return vrndmq_f16(x);
#elif defined(__aarch64__)
return vrndmq_f32(x);
#elif defined(USING_AVX512F)
@@ -263,10 +242,6 @@
}
SI F approx_log2(F x) {
-#if defined(USING_NEON_FP16)
- // TODO(mtklein)
- return x;
-#else
// The first approximation of log2(x) is its exponent 'e', minus 127.
I32 bits = bit_pun<I32>(x);
@@ -278,7 +253,6 @@
return e - 124.225514990f
- 1.498030302f*m
- 1.725879990f/(0.3520887068f + m);
-#endif
}
SI F approx_log(F x) {
@@ -287,10 +261,6 @@
}
SI F approx_exp2(F x) {
-#if defined(USING_NEON_FP16)
- // TODO(mtklein)
- return x;
-#else
F fract = x - floor_(x);
F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
@@ -299,7 +269,6 @@
I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
return bit_pun<F>(bits);
-#endif
}
SI F approx_pow(F x, float y) {
@@ -314,11 +283,6 @@
// Return tf(x).
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
- // TODO(mtklein)
- (void)tf;
- return x;
-#else
// Peel off the sign bit and set x = |x|.
U32 bits = bit_pun<U32>(x),
sign = bits & 0x80000000;
@@ -330,15 +294,9 @@
// Tack the sign bit back on.
return bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
}
SI F apply_pq(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
- // TODO(mtklein)
- (void)tf;
- return x;
-#else
U32 bits = bit_pun<U32>(x),
sign = bits & 0x80000000;
x = bit_pun<F>(bits ^ sign);
@@ -348,15 +306,9 @@
tf->f);
return bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
}
SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
- // TODO(mtklein)
- (void)tf;
- return x;
-#else
const float R = tf->a, G = tf->b,
a = tf->c, b = tf->d, c = tf->e,
K = tf->f + 1;
@@ -368,15 +320,9 @@
, approx_exp((x-c)*a) + b);
return K*bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
}
SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
-#if defined(USING_NEON_FP16)
- // TODO(mtklein)
- (void)tf;
- return x;
-#else
const float R = tf->a, G = tf->b,
a = tf->c, b = tf->d, c = tf->e,
K = tf->f + 1;
@@ -389,7 +335,6 @@
, a * approx_log(x - b) + c);
return bit_pun<F>(sign | bit_pun<U32>(v));
-#endif
}
@@ -636,11 +581,7 @@
}
SI F minus_1_ulp(F v) {
-#if defined(USING_NEON_FP16)
- return bit_pun<F>( bit_pun<U16>(v) - 1 );
-#else
return bit_pun<F>( bit_pun<U32>(v) - 1 );
-#endif
}
SI F table(const skcms_Curve* curve, F v) {
@@ -835,23 +776,7 @@
case Op_load_888:{
const uint8_t* rgb = (const uint8_t*)(src + 3*i);
- #if defined(USING_NEON_FP16)
- // See the explanation under USING_NEON below. This is that doubled up.
- uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
- v = vld3q_lane_u8(rgb+ 0, v, 0);
- v = vld3q_lane_u8(rgb+ 3, v, 2);
- v = vld3q_lane_u8(rgb+ 6, v, 4);
- v = vld3q_lane_u8(rgb+ 9, v, 6);
-
- v = vld3q_lane_u8(rgb+12, v, 8);
- v = vld3q_lane_u8(rgb+15, v, 10);
- v = vld3q_lane_u8(rgb+18, v, 12);
- v = vld3q_lane_u8(rgb+21, v, 14);
-
- r = cast<F>((U16)v.val[0]) * (1/255.0f);
- g = cast<F>((U16)v.val[1]) * (1/255.0f);
- b = cast<F>((U16)v.val[2]) * (1/255.0f);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
// There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
// a time. Since we're doing that, we might as well load them into 16-bit lanes.
// (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
@@ -917,12 +842,7 @@
uintptr_t ptr = (uintptr_t)(src + 6*i);
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x3_t v = vld3q_u16(rgb);
- r = cast<F>((U16)v.val[0]) * (1/65535.0f);
- g = cast<F>((U16)v.val[1]) * (1/65535.0f);
- b = cast<F>((U16)v.val[2]) * (1/65535.0f);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x3_t v = vld3_u16(rgb);
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
@@ -938,13 +858,7 @@
uintptr_t ptr = (uintptr_t)(src + 8*i);
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x4_t v = vld4q_u16(rgba);
- r = cast<F>((U16)v.val[0]) * (1/65535.0f);
- g = cast<F>((U16)v.val[1]) * (1/65535.0f);
- b = cast<F>((U16)v.val[2]) * (1/65535.0f);
- a = cast<F>((U16)v.val[3]) * (1/65535.0f);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x4_t v = vld4_u16(rgba);
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
@@ -964,12 +878,7 @@
uintptr_t ptr = (uintptr_t)(src + 6*i);
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x3_t v = vld3q_u16(rgb);
- r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
- g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
- b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x3_t v = vld3_u16(rgb);
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
@@ -989,13 +898,7 @@
uintptr_t ptr = (uintptr_t)(src + 8*i);
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x4_t v = vld4q_u16(rgba);
- r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
- g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
- b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
- a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x4_t v = vld4_u16(rgba);
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
@@ -1015,12 +918,7 @@
uintptr_t ptr = (uintptr_t)(src + 6*i);
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x3_t v = vld3q_u16(rgb);
- U16 R = (U16)v.val[0],
- G = (U16)v.val[1],
- B = (U16)v.val[2];
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x3_t v = vld3_u16(rgb);
U16 R = (U16)v.val[0],
G = (U16)v.val[1],
@@ -1039,13 +937,7 @@
uintptr_t ptr = (uintptr_t)(src + 8*i);
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x4_t v = vld4q_u16(rgba);
- U16 R = (U16)v.val[0],
- G = (U16)v.val[1],
- B = (U16)v.val[2],
- A = (U16)v.val[3];
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x4_t v = vld4_u16(rgba);
U16 R = (U16)v.val[0],
G = (U16)v.val[1],
@@ -1068,13 +960,7 @@
uintptr_t ptr = (uintptr_t)(src + 12*i);
assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
const float* rgb = (const float*)ptr; // cast to const float* to be safe.
- #if defined(USING_NEON_FP16)
- float32x4x3_t lo = vld3q_f32(rgb + 0),
- hi = vld3q_f32(rgb + 12);
- r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
- g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
- b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
float32x4x3_t v = vld3q_f32(rgb);
r = (F)v.val[0];
g = (F)v.val[1];
@@ -1090,14 +976,7 @@
uintptr_t ptr = (uintptr_t)(src + 16*i);
assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
const float* rgba = (const float*)ptr; // cast to const float* to be safe.
- #if defined(USING_NEON_FP16)
- float32x4x4_t lo = vld4q_f32(rgba + 0),
- hi = vld4q_f32(rgba + 16);
- r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
- g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
- b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
- a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
float32x4x4_t v = vld4q_f32(rgba);
r = (F)v.val[0];
g = (F)v.val[1];
@@ -1280,23 +1159,7 @@
case Op_store_888: {
uint8_t* rgb = (uint8_t*)dst + 3*i;
- #if defined(USING_NEON_FP16)
- // See the explanation under USING_NEON below. This is that doubled up.
- U16 R = to_fixed(r * 255),
- G = to_fixed(g * 255),
- B = to_fixed(b * 255);
-
- uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
- vst3q_lane_u8(rgb+ 0, v, 0);
- vst3q_lane_u8(rgb+ 3, v, 2);
- vst3q_lane_u8(rgb+ 6, v, 4);
- vst3q_lane_u8(rgb+ 9, v, 6);
-
- vst3q_lane_u8(rgb+12, v, 8);
- vst3q_lane_u8(rgb+15, v, 10);
- vst3q_lane_u8(rgb+18, v, 12);
- vst3q_lane_u8(rgb+21, v, 14);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
// Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
// get there via U16 to save some instructions converting to float. And just
// like load_888, we'd prefer to go via U32 but for ARMv7 support.
@@ -1343,14 +1206,7 @@
uintptr_t ptr = (uintptr_t)(dst + 6*i);
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x3_t v = {{
- (uint16x8_t)U16_from_F(r),
- (uint16x8_t)U16_from_F(g),
- (uint16x8_t)U16_from_F(b),
- }};
- vst3q_u16(rgb, v);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x3_t v = {{
(uint16x4_t)U16_from_F(r),
(uint16x4_t)U16_from_F(g),
@@ -1369,15 +1225,7 @@
uintptr_t ptr = (uintptr_t)(dst + 8*i);
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x4_t v = {{
- (uint16x8_t)U16_from_F(r),
- (uint16x8_t)U16_from_F(g),
- (uint16x8_t)U16_from_F(b),
- (uint16x8_t)U16_from_F(a),
- }};
- vst4q_u16(rgba, v);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x4_t v = {{
(uint16x4_t)U16_from_F(r),
(uint16x4_t)U16_from_F(g),
@@ -1398,14 +1246,7 @@
uintptr_t ptr = (uintptr_t)(dst + 6*i);
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x3_t v = {{
- (uint16x8_t)swap_endian_16(U16_from_F(r)),
- (uint16x8_t)swap_endian_16(U16_from_F(g)),
- (uint16x8_t)swap_endian_16(U16_from_F(b)),
- }};
- vst3q_u16(rgb, v);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x3_t v = {{
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
@@ -1427,15 +1268,7 @@
uintptr_t ptr = (uintptr_t)(dst + 8*i);
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON_FP16)
- uint16x8x4_t v = {{
- (uint16x8_t)swap_endian_16(U16_from_F(r)),
- (uint16x8_t)swap_endian_16(U16_from_F(g)),
- (uint16x8_t)swap_endian_16(U16_from_F(b)),
- (uint16x8_t)swap_endian_16(U16_from_F(a)),
- }};
- vst4q_u16(rgba, v);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x4_t v = {{
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
@@ -1460,14 +1293,7 @@
U16 R = Half_from_F(r),
G = Half_from_F(g),
B = Half_from_F(b);
- #if defined(USING_NEON_FP16)
- uint16x8x3_t v = {{
- (uint16x8_t)R,
- (uint16x8_t)G,
- (uint16x8_t)B,
- }};
- vst3q_u16(rgb, v);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x3_t v = {{
(uint16x4_t)R,
(uint16x4_t)G,
@@ -1490,15 +1316,7 @@
G = Half_from_F(g),
B = Half_from_F(b),
A = Half_from_F(a);
- #if defined(USING_NEON_FP16)
- uint16x8x4_t v = {{
- (uint16x8_t)R,
- (uint16x8_t)G,
- (uint16x8_t)B,
- (uint16x8_t)A,
- }};
- vst4q_u16(rgba, v);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
uint16x4x4_t v = {{
(uint16x4_t)R,
(uint16x4_t)G,
@@ -1519,19 +1337,7 @@
uintptr_t ptr = (uintptr_t)(dst + 12*i);
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
float* rgb = (float*)ptr; // for this cast to float* to be safe.
- #if defined(USING_NEON_FP16)
- float32x4x3_t lo = {{
- vcvt_f32_f16(vget_low_f16(r)),
- vcvt_f32_f16(vget_low_f16(g)),
- vcvt_f32_f16(vget_low_f16(b)),
- }}, hi = {{
- vcvt_f32_f16(vget_high_f16(r)),
- vcvt_f32_f16(vget_high_f16(g)),
- vcvt_f32_f16(vget_high_f16(b)),
- }};
- vst3q_f32(rgb + 0, lo);
- vst3q_f32(rgb + 12, hi);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
float32x4x3_t v = {{
(float32x4_t)r,
(float32x4_t)g,
@@ -1549,21 +1355,7 @@
uintptr_t ptr = (uintptr_t)(dst + 16*i);
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
float* rgba = (float*)ptr; // for this cast to float* to be safe.
- #if defined(USING_NEON_FP16)
- float32x4x4_t lo = {{
- vcvt_f32_f16(vget_low_f16(r)),
- vcvt_f32_f16(vget_low_f16(g)),
- vcvt_f32_f16(vget_low_f16(b)),
- vcvt_f32_f16(vget_low_f16(a)),
- }}, hi = {{
- vcvt_f32_f16(vget_high_f16(r)),
- vcvt_f32_f16(vget_high_f16(g)),
- vcvt_f32_f16(vget_high_f16(b)),
- vcvt_f32_f16(vget_high_f16(a)),
- }};
- vst4q_f32(rgba + 0, lo);
- vst4q_f32(rgba + 16, hi);
- #elif defined(USING_NEON)
+ #if defined(USING_NEON)
float32x4x4_t v = {{
(float32x4_t)r,
(float32x4_t)g,
@@ -1621,8 +1413,5 @@
#if defined(USING_NEON_F16C)
#undef USING_NEON_F16C
#endif
-#if defined(USING_NEON_FP16)
- #undef USING_NEON_FP16
-#endif
#undef FALLTHROUGH
diff --git a/tests.c b/tests.c
index c4ca414..04a7a28 100644
--- a/tests.c
+++ b/tests.c
@@ -18,12 +18,6 @@
#include <stdlib.h>
#include <string.h>
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
- static bool kFP16 = true;
-#else
- static bool kFP16 = false;
-#endif
-
#if defined(_MSC_VER)
#define DEBUGBREAK __debugbreak
#elif defined(__clang__)
@@ -53,7 +47,7 @@
double ratio = (X < Y) ? X / Y \
: (Y < X) ? Y / X \
: 1.0; \
- if (ratio < (kFP16 ? 0.995 : 1.0)) { \
+ if (ratio < 1.0) { \
fprintf(stderr, "expect_close(" #x "==%g, " #y "==%g) failed at %s:%d\n", \
X,Y, __FILE__,__LINE__); \
fflush(stderr); /* stderr is buffered on Windows. */ \
@@ -319,7 +313,7 @@
// so the low lanes are actually the most significant byte, and the high least.
expect(dst[ 0] == 0x03020100);
- expect(dst[ 8127] == (kFP16 ? 0xfffefdfc : 0xfefefdfc));
+ expect(dst[ 8127] == 0xfefefdfc);
expect(dst[16383] == 0xfffefdfc);
// We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -353,7 +347,7 @@
expect(dst[0] == 0xff020100);
expect(dst[1] == 0xfffdfc03);
- expect(dst[2] == (kFP16 ? 0xfffcfffe : 0xfffcfefe));
+ expect(dst[2] == 0xfffcfefe);
expect(dst[3] == 0xfffffefd);
// We've lost precision when transforming to 8-bit, so these won't quite round-trip.
@@ -1925,12 +1919,10 @@
test_ParseWithA2BPriority();
test_B2A();
- // Temporarily disable some tests while getting FP16 compute working.
- if (!kFP16) {
- test_Parse(regenTestData);
- test_sRGB_AllBytes();
- test_TRC_Table16();
- }
+ test_Parse(regenTestData);
+ test_sRGB_AllBytes();
+ test_TRC_Table16();
+
#if 0
test_CLUT();
#endif