use O(1) pragmas instead of O(N) attributes
Both Clang and GCC now have ways to set a target attribute
using a pragma for a section of code. We can do this for
the extra x86 Haswell slice. skcms.o stays identical.
Change-Id: I07d76df1aac71b166254e02ac9139508513ca2ee
Reviewed-on: https://skia-review.googlesource.com/150962
Auto-Submit: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/skcms.cc b/skcms.cc
index f6d004c..551c932 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -1873,17 +1873,25 @@
using U8 = Vec<N,uint8_t>;
#endif
- #define ATTR
#include "src/Transform_inl.h"
#undef N
- #undef ATTR
}
// Now, instantiate any other versions of run_program() we may want for runtime detection.
#if !defined(SKCMS_PORTABLE) && (defined(__clang__) || defined(__GNUC__)) \
&& defined(__x86_64__) && !defined(__AVX2__)
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("avx2,f16c")
+ #endif
+
namespace hsw {
+ #define USING_AVX
+ #define USING_AVX_F16C
+ #define USING_AVX2
#define N 8
using F = Vec<N,float>;
using I32 = Vec<N,int32_t>;
@@ -1892,18 +1900,18 @@
using U16 = Vec<N,uint16_t>;
using U8 = Vec<N,uint8_t>;
- #define ATTR __attribute__((target("avx2,f16c")))
- #define USING_AVX
- #define USING_AVX_F16C
- #define USING_AVX2
-
#include "src/Transform_inl.h"
- #undef N
- #undef ATTR
// src/Transform_inl.h will undefine USING_* for us.
+ #undef N
}
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
#define TEST_FOR_HSW
static bool hsw_ok() {
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index effc44a..6a79843 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -9,7 +9,6 @@
// This file is included from skcms.cc with some pre-defined macros:
// N: depth of all vectors, 1,4,8, or 16
-// ATTR: an __attribute__ to apply to functions
// and inside a namespace, with some types already defined:
// F: a vector of N float
// I32: a vector of N int32_t
@@ -81,20 +80,20 @@
#endif
template <typename T, typename P>
-SI ATTR T load(const P* ptr) {
+SI T load(const P* ptr) {
T val;
small_memcpy(&val, ptr, sizeof(val));
return val;
}
template <typename T, typename P>
-SI ATTR void store(P* ptr, const T& val) {
+SI void store(P* ptr, const T& val) {
small_memcpy(ptr, &val, sizeof(val));
}
// (T)v is a cast when N == 1 and a bit-pun when N>1,
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
template <typename D, typename S>
-SI ATTR D cast(const S& v) {
+SI D cast(const S& v) {
#if N == 1
return (D)v;
#elif defined(__clang__)
@@ -110,7 +109,7 @@
}
template <typename D, typename S>
-SI ATTR D bit_pun(const S& v) {
+SI D bit_pun(const S& v) {
static_assert(sizeof(D) == sizeof(v), "");
return load<D>(&v);
}
@@ -118,10 +117,10 @@
// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct cast().
-SI ATTR I32 to_fixed(F f) { return cast<I32>(f + 0.5f); }
+SI I32 to_fixed(F f) { return cast<I32>(f + 0.5f); }
template <typename T>
-SI ATTR T if_then_else(I32 cond, T t, T e) {
+SI T if_then_else(I32 cond, T t, T e) {
#if N == 1
return cond ? t : e;
#else
@@ -130,7 +129,7 @@
#endif
}
-SI ATTR F F_from_Half(U16 half) {
+SI F F_from_Half(U16 half) {
#if defined(USING_NEON_F16C)
return vcvt_f32_f16((float16x4_t)half);
#elif defined(__AVX512F__)
@@ -157,7 +156,7 @@
// we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
__attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
-SI ATTR U16 Half_from_F(F f) {
+SI U16 Half_from_F(F f) {
#if defined(USING_NEON_F16C)
return (U16)vcvt_f16_f32(f);
#elif defined(__AVX512F__)
@@ -178,25 +177,25 @@
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON)
- SI ATTR U16 swap_endian_16(U16 v) {
+ SI U16 swap_endian_16(U16 v) {
return (U16)vrev16_u8((uint8x8_t) v);
}
#endif
-SI ATTR U64 swap_endian_16x4(const U64& rgba) {
+SI U64 swap_endian_16x4(const U64& rgba) {
return (rgba & 0x00ff00ff00ff00ff) << 8
| (rgba & 0xff00ff00ff00ff00) >> 8;
}
#if defined(USING_NEON)
- SI ATTR F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
- SI ATTR F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+ SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+ SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#else
- SI ATTR F min_(F x, F y) { return if_then_else(x > y, y, x); }
- SI ATTR F max_(F x, F y) { return if_then_else(x < y, y, x); }
+ SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
+ SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
#endif
-SI ATTR F floor_(F x) {
+SI F floor_(F x) {
#if N == 1
return floorf_(x);
#elif defined(__aarch64__)
@@ -218,7 +217,7 @@
#endif
}
-SI ATTR F approx_log2(F x) {
+SI F approx_log2(F x) {
// The first approximation of log2(x) is its exponent 'e', minus 127.
I32 bits = bit_pun<I32>(x);
@@ -232,7 +231,7 @@
- 1.725879990f/(0.3520887068f + m);
}
-SI ATTR F approx_exp2(F x) {
+SI F approx_exp2(F x) {
F fract = x - floor_(x);
I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
@@ -241,13 +240,13 @@
return bit_pun<F>(bits);
}
-SI ATTR F approx_pow(F x, float y) {
+SI F approx_pow(F x, float y) {
return if_then_else((x == F0) | (x == F1), x
, approx_exp2(approx_log2(x) * y));
}
// Return tf(x).
-SI ATTR F apply_tf(const skcms_TransferFunction* tf, F x) {
+SI F apply_tf(const skcms_TransferFunction* tf, F x) {
// Peel off the sign bit and set x = |x|.
U32 bits = bit_pun<U32>(x),
sign = bits & 0x80000000;
@@ -264,7 +263,7 @@
// Strided loads and stores of N values, starting from p.
template <typename T, typename P>
-SI ATTR T load_3(const P* p) {
+SI T load_3(const P* p) {
#if N == 1
return (T)p[0];
#elif N == 4
@@ -278,7 +277,7 @@
}
template <typename T, typename P>
-SI ATTR T load_4(const P* p) {
+SI T load_4(const P* p) {
#if N == 1
return (T)p[0];
#elif N == 4
@@ -292,7 +291,7 @@
}
template <typename T, typename P>
-SI ATTR void store_3(P* p, const T& v) {
+SI void store_3(P* p, const T& v) {
#if N == 1
p[0] = v;
#elif N == 4
@@ -309,7 +308,7 @@
}
template <typename T, typename P>
-SI ATTR void store_4(P* p, const T& v) {
+SI void store_4(P* p, const T& v) {
#if N == 1
p[0] = v;
#elif N == 4
@@ -326,7 +325,7 @@
}
-SI ATTR U8 gather_8(const uint8_t* p, I32 ix) {
+SI U8 gather_8(const uint8_t* p, I32 ix) {
#if N == 1
U8 v = p[ix];
#elif N == 4
@@ -344,11 +343,11 @@
}
// Helper for gather_16(), loading the ix'th 16-bit value from p.
-SI ATTR uint16_t load_16(const uint8_t* p, int ix) {
+SI uint16_t load_16(const uint8_t* p, int ix) {
return load<uint16_t>(p + 2*ix);
}
-SI ATTR U16 gather_16(const uint8_t* p, I32 ix) {
+SI U16 gather_16(const uint8_t* p, I32 ix) {
#if N == 1
U16 v = load_16(p,ix);
#elif N == 4
@@ -367,15 +366,15 @@
#if !defined(USING_AVX2)
// Helpers for gather_24/48(), loading the ix'th 24/48-bit value from p, and 1/2 extra bytes.
- SI ATTR uint32_t load_24_32(const uint8_t* p, int ix) {
+ SI uint32_t load_24_32(const uint8_t* p, int ix) {
return load<uint32_t>(p + 3*ix);
}
- SI ATTR uint64_t load_48_64(const uint8_t* p, int ix) {
+ SI uint64_t load_48_64(const uint8_t* p, int ix) {
return load<uint64_t>(p + 6*ix);
}
#endif
-SI ATTR U32 gather_24(const uint8_t* p, I32 ix) {
+SI U32 gather_24(const uint8_t* p, I32 ix) {
// First, back up a byte. Any place we're gathering from has a safe junk byte to read
// in front of it, either a previous table value, or some tag metadata.
p -= 1;
@@ -411,7 +410,7 @@
}
#if !defined(__arm__)
- SI ATTR void gather_48(const uint8_t* p, I32 ix, U64* v) {
+ SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
// As in gather_24(), with everything doubled.
p -= 2;
@@ -462,22 +461,22 @@
}
#endif
-SI ATTR F F_from_U8(U8 v) {
+SI F F_from_U8(U8 v) {
return cast<F>(v) * (1/255.0f);
}
-SI ATTR F F_from_U16_BE(U16 v) {
+SI F F_from_U16_BE(U16 v) {
// All 16-bit ICC values are big-endian, so we byte swap before converting to float.
// MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
v = (U16)( ((v<<8)|(v>>8)) & 0xffff );
return cast<F>(v) * (1/65535.0f);
}
-SI ATTR F minus_1_ulp(F v) {
+SI F minus_1_ulp(F v) {
return bit_pun<F>( bit_pun<I32>(v) - 1 );
}
-SI ATTR F table_8(const skcms_Curve* curve, F v) {
+SI F table_8(const skcms_Curve* curve, F v) {
// Clamp the input to [0,1], then scale to a table index.
F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
@@ -495,7 +494,7 @@
return l + (h-l)*t;
}
-SI ATTR F table_16(const skcms_Curve* curve, F v) {
+SI F table_16(const skcms_Curve* curve, F v) {
// All just as in table_8() until the gathers.
F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
@@ -511,7 +510,7 @@
}
// Color lookup tables, by input dimension and bit depth.
-SI ATTR void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
U32 rgb = gather_24(a2b->grid_8, ix);
*r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
@@ -521,7 +520,7 @@
(void)a;
(void)stride;
}
-SI ATTR void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
#if defined(__arm__)
// This is up to 2x faster on 32-bit ARM than the #else-case fast path.
*r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
@@ -552,7 +551,7 @@
// These are all the same basic approach: handle one dimension, then the rest recursively.
// We let "I" be the current dimension, and "J" the previous dimension, I-1. "B" is the bit depth.
#define DEF_CLUT(I,J,B) \
- MAYBE_SI ATTR \
+ MAYBE_SI \
void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
I32 limit = cast<I32>(F0); \
limit += a2b->grid_points[I-1]; \
@@ -585,7 +584,7 @@
DEF_CLUT(3,2,16)
DEF_CLUT(4,3,16)
-ATTR
+
static void exec_ops(const Op* ops, const void** args,
const char* src, char* dst, int i) {
F r = F0, g = F0, b = F0, a = F0;
@@ -1117,7 +1116,7 @@
}
}
-ATTR
+
static void run_program(const Op* program, const void** arguments,
const char* src, char* dst, int n,
const size_t src_bpp, const size_t dst_bpp) {