use O(1) pragmas instead of O(N) attributes

Both Clang and GCC now have ways to set a target attribute using a
pragma for a section of code.  We can do this for the extra x86
Haswell slice.  skcms.o stays identical.
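
For context, the pragma pattern looks roughly like this.  This is a
minimal sketch, not the skcms code itself; add_one() is a made-up
placeholder function, only there to show where the target settings
apply:

    #if defined(__clang__)
        // Clang: apply the target attribute to every function until pop.
        #pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
    #elif defined(__GNUC__)
        // GCC: save the current options, then switch targets for what follows.
        #pragma GCC push_options
        #pragma GCC target("avx2,f16c")
    #endif

    // Hypothetical example: compiled with AVX2/F16C enabled, with no
    // per-function __attribute__((target(...))) needed.
    static float add_one(float x) { return x + 1.0f; }

    #if defined(__clang__)
        #pragma clang attribute pop
    #elif defined(__GNUC__)
        #pragma GCC pop_options
    #endif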

Change-Id: I07d76df1aac71b166254e02ac9139508513ca2ee
Reviewed-on: https://skia-review.googlesource.com/150962
Auto-Submit: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/skcms.cc b/skcms.cc
index f6d004c..551c932 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -1873,17 +1873,25 @@
     using  U8 = Vec<N,uint8_t>;
 #endif
 
-    #define ATTR
     #include "src/Transform_inl.h"
     #undef N
-    #undef ATTR
 }
 
 // Now, instantiate any other versions of run_program() we may want for runtime detection.
 #if !defined(SKCMS_PORTABLE) && (defined(__clang__) || defined(__GNUC__)) \
         && defined(__x86_64__) && !defined(__AVX2__)
 
+    #if defined(__clang__)
+        #pragma clang attribute push(__attribute__((target("avx2,f16c"))), apply_to=function)
+    #elif defined(__GNUC__)
+        #pragma GCC push_options
+        #pragma GCC target("avx2,f16c")
+    #endif
+
     namespace hsw {
+        #define USING_AVX
+        #define USING_AVX_F16C
+        #define USING_AVX2
         #define N 8
         using   F = Vec<N,float>;
         using I32 = Vec<N,int32_t>;
@@ -1892,18 +1900,18 @@
         using U16 = Vec<N,uint16_t>;
         using  U8 = Vec<N,uint8_t>;
 
-        #define ATTR __attribute__((target("avx2,f16c")))
-        #define USING_AVX
-        #define USING_AVX_F16C
-        #define USING_AVX2
-
         #include "src/Transform_inl.h"
 
-        #undef N
-        #undef ATTR
         // src/Transform_inl.h will undefine USING_* for us.
+        #undef N
     }
 
+    #if defined(__clang__)
+        #pragma clang attribute pop
+    #elif defined(__GNUC__)
+        #pragma GCC pop_options
+    #endif
+
     #define TEST_FOR_HSW
 
     static bool hsw_ok() {
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index effc44a..6a79843 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -9,7 +9,6 @@
 
 // This file is included from skcms.cc with some pre-defined macros:
 //    N:    depth of all vectors, 1,4,8, or 16
-//    ATTR:   an __attribute__ to apply to functions
 // and inside a namespace, with some types already defined:
 //    F:    a vector of N float
 //    I32:  a vector of N int32_t
@@ -81,20 +80,20 @@
 #endif
 
 template <typename T, typename P>
-SI ATTR T load(const P* ptr) {
+SI T load(const P* ptr) {
     T val;
     small_memcpy(&val, ptr, sizeof(val));
     return val;
 }
 template <typename T, typename P>
-SI ATTR void store(P* ptr, const T& val) {
+SI void store(P* ptr, const T& val) {
     small_memcpy(ptr, &val, sizeof(val));
 }
 
 // (T)v is a cast when N == 1 and a bit-pun when N>1,
 // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
 template <typename D, typename S>
-SI ATTR D cast(const S& v) {
+SI D cast(const S& v) {
 #if N == 1
     return (D)v;
 #elif defined(__clang__)
@@ -110,7 +109,7 @@
 }
 
 template <typename D, typename S>
-SI ATTR D bit_pun(const S& v) {
+SI D bit_pun(const S& v) {
     static_assert(sizeof(D) == sizeof(v), "");
     return load<D>(&v);
 }
@@ -118,10 +117,10 @@
 // When we convert from float to fixed point, it's very common to want to round,
 // and for some reason compilers generate better code when converting to int32_t.
 // To serve both those ends, we use this function to_fixed() instead of direct cast().
-SI ATTR I32 to_fixed(F f) {  return cast<I32>(f + 0.5f); }
+SI I32 to_fixed(F f) {  return cast<I32>(f + 0.5f); }
 
 template <typename T>
-SI ATTR T if_then_else(I32 cond, T t, T e) {
+SI T if_then_else(I32 cond, T t, T e) {
 #if N == 1
     return cond ? t : e;
 #else
@@ -130,7 +129,7 @@
 #endif
 }
 
-SI ATTR F F_from_Half(U16 half) {
+SI F F_from_Half(U16 half) {
 #if defined(USING_NEON_F16C)
     return vcvt_f32_f16((float16x4_t)half);
 #elif defined(__AVX512F__)
@@ -157,7 +156,7 @@
     // we pass a denorm half float.  It's harmless... we'll take the 0 side anyway.
     __attribute__((no_sanitize("unsigned-integer-overflow")))
 #endif
-SI ATTR U16 Half_from_F(F f) {
+SI U16 Half_from_F(F f) {
 #if defined(USING_NEON_F16C)
     return (U16)vcvt_f16_f32(f);
 #elif defined(__AVX512F__)
@@ -178,25 +177,25 @@
 
 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
 #if defined(USING_NEON)
-    SI ATTR U16 swap_endian_16(U16 v) {
+    SI U16 swap_endian_16(U16 v) {
         return (U16)vrev16_u8((uint8x8_t) v);
     }
 #endif
 
-SI ATTR U64 swap_endian_16x4(const U64& rgba) {
+SI U64 swap_endian_16x4(const U64& rgba) {
     return (rgba & 0x00ff00ff00ff00ff) << 8
          | (rgba & 0xff00ff00ff00ff00) >> 8;
 }
 
 #if defined(USING_NEON)
-    SI ATTR F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
-    SI ATTR F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
 #else
-    SI ATTR F min_(F x, F y) { return if_then_else(x > y, y, x); }
-    SI ATTR F max_(F x, F y) { return if_then_else(x < y, y, x); }
+    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
+    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
 #endif
 
-SI ATTR F floor_(F x) {
+SI F floor_(F x) {
 #if N == 1
     return floorf_(x);
 #elif defined(__aarch64__)
@@ -218,7 +217,7 @@
 #endif
 }
 
-SI ATTR F approx_log2(F x) {
+SI F approx_log2(F x) {
     // The first approximation of log2(x) is its exponent 'e', minus 127.
     I32 bits = bit_pun<I32>(x);
 
@@ -232,7 +231,7 @@
              -   1.725879990f/(0.3520887068f + m);
 }
 
-SI ATTR F approx_exp2(F x) {
+SI F approx_exp2(F x) {
     F fract = x - floor_(x);
 
     I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
@@ -241,13 +240,13 @@
     return bit_pun<F>(bits);
 }
 
-SI ATTR F approx_pow(F x, float y) {
+SI F approx_pow(F x, float y) {
     return if_then_else((x == F0) | (x == F1), x
                                              , approx_exp2(approx_log2(x) * y));
 }
 
 // Return tf(x).
-SI ATTR F apply_tf(const skcms_TransferFunction* tf, F x) {
+SI F apply_tf(const skcms_TransferFunction* tf, F x) {
     // Peel off the sign bit and set x = |x|.
     U32 bits = bit_pun<U32>(x),
         sign = bits & 0x80000000;
@@ -264,7 +263,7 @@
 
 // Strided loads and stores of N values, starting from p.
 template <typename T, typename P>
-SI ATTR T load_3(const P* p) {
+SI T load_3(const P* p) {
 #if N == 1
     return (T)p[0];
 #elif N == 4
@@ -278,7 +277,7 @@
 }
 
 template <typename T, typename P>
-SI ATTR T load_4(const P* p) {
+SI T load_4(const P* p) {
 #if N == 1
     return (T)p[0];
 #elif N == 4
@@ -292,7 +291,7 @@
 }
 
 template <typename T, typename P>
-SI ATTR void store_3(P* p, const T& v) {
+SI void store_3(P* p, const T& v) {
 #if N == 1
     p[0] = v;
 #elif N == 4
@@ -309,7 +308,7 @@
 }
 
 template <typename T, typename P>
-SI ATTR void store_4(P* p, const T& v) {
+SI void store_4(P* p, const T& v) {
 #if N == 1
     p[0] = v;
 #elif N == 4
@@ -326,7 +325,7 @@
 }
 
 
-SI ATTR U8 gather_8(const uint8_t* p, I32 ix) {
+SI U8 gather_8(const uint8_t* p, I32 ix) {
 #if N == 1
     U8 v = p[ix];
 #elif N == 4
@@ -344,11 +343,11 @@
 }
 
 // Helper for gather_16(), loading the ix'th 16-bit value from p.
-SI ATTR uint16_t load_16(const uint8_t* p, int ix) {
+SI uint16_t load_16(const uint8_t* p, int ix) {
     return load<uint16_t>(p + 2*ix);
 }
 
-SI ATTR U16 gather_16(const uint8_t* p, I32 ix) {
+SI U16 gather_16(const uint8_t* p, I32 ix) {
 #if N == 1
     U16 v = load_16(p,ix);
 #elif N == 4
@@ -367,15 +366,15 @@
 
 #if !defined(USING_AVX2)
     // Helpers for gather_24/48(), loading the ix'th 24/48-bit value from p, and 1/2 extra bytes.
-    SI ATTR uint32_t load_24_32(const uint8_t* p, int ix) {
+    SI uint32_t load_24_32(const uint8_t* p, int ix) {
         return load<uint32_t>(p + 3*ix);
     }
-    SI ATTR uint64_t load_48_64(const uint8_t* p, int ix) {
+    SI uint64_t load_48_64(const uint8_t* p, int ix) {
         return load<uint64_t>(p + 6*ix);
     }
 #endif
 
-SI ATTR U32 gather_24(const uint8_t* p, I32 ix) {
+SI U32 gather_24(const uint8_t* p, I32 ix) {
     // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
     // in front of it, either a previous table value, or some tag metadata.
     p -= 1;
@@ -411,7 +410,7 @@
 }
 
 #if !defined(__arm__)
-    SI ATTR void gather_48(const uint8_t* p, I32 ix, U64* v) {
+    SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
         // As in gather_24(), with everything doubled.
         p -= 2;
 
@@ -462,22 +461,22 @@
     }
 #endif
 
-SI ATTR F F_from_U8(U8 v) {
+SI F F_from_U8(U8 v) {
     return cast<F>(v) * (1/255.0f);
 }
 
-SI ATTR F F_from_U16_BE(U16 v) {
+SI F F_from_U16_BE(U16 v) {
     // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
     // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
     v = (U16)( ((v<<8)|(v>>8)) & 0xffff );
     return cast<F>(v) * (1/65535.0f);
 }
 
-SI ATTR F minus_1_ulp(F v) {
+SI F minus_1_ulp(F v) {
     return bit_pun<F>( bit_pun<I32>(v) - 1 );
 }
 
-SI ATTR F table_8(const skcms_Curve* curve, F v) {
+SI F table_8(const skcms_Curve* curve, F v) {
     // Clamp the input to [0,1], then scale to a table index.
     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
 
@@ -495,7 +494,7 @@
     return l + (h-l)*t;
 }
 
-SI ATTR F table_16(const skcms_Curve* curve, F v) {
+SI F table_16(const skcms_Curve* curve, F v) {
     // All just as in table_8() until the gathers.
     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
 
@@ -511,7 +510,7 @@
 }
 
 // Color lookup tables, by input dimension and bit depth.
-SI ATTR void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
     U32 rgb = gather_24(a2b->grid_8, ix);
 
     *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
@@ -521,7 +520,7 @@
     (void)a;
     (void)stride;
 }
-SI ATTR void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+SI void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
     #if defined(__arm__)
         // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
         *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
@@ -552,7 +551,7 @@
 // These are all the same basic approach: handle one dimension, then the rest recursively.
 // We let "I" be the current dimension, and "J" the previous dimension, I-1.  "B" is the bit depth.
 #define DEF_CLUT(I,J,B)                                                                    \
-    MAYBE_SI ATTR                                                                          \
+    MAYBE_SI \
     void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
         I32 limit = cast<I32>(F0);                                                         \
         limit += a2b->grid_points[I-1];                                                    \
@@ -585,7 +584,7 @@
 DEF_CLUT(3,2,16)
 DEF_CLUT(4,3,16)
 
-ATTR
+
 static void exec_ops(const Op* ops, const void** args,
                      const char* src, char* dst, int i) {
     F r = F0, g = F0, b = F0, a = F0;
@@ -1117,7 +1116,7 @@
     }
 }
 
-ATTR
+
 static void run_program(const Op* program, const void** arguments,
                         const char* src, char* dst, int n,
                         const size_t src_bpp, const size_t dst_bpp) {