test and fix f16<->f32 conversion stages

This refactors from_half() and to_half(), completely reimplementing the
non-hardware cases so that they are more clearly correct.
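
The new non-hardware paths convert by direct bit manipulation instead of
multiplying by a power-of-two scale: split off the sign bit, shift the
exponent+mantissa field into place, rebias the exponent between 15 (half)
and 127 (float) by adding or subtracting a constant, and flush denormal
halfs (including zero) to zero. Below is a minimal scalar sketch of that
logic, reconstructed from the constants visible in the generated AVX code
(0x8000, 0x38000000, 0x1c000, 0x38800000); the names and scalar form are
illustrative only. The real stages operate on SIMD vectors, and the AVX
path detects denorms by testing the 0x7c00 exponent field against zero,
which is equivalent to the em < 0x0400 comparison used here. Like the
real stages, the sketch assumes values in half's normal range and does
not handle infinity or NaN.

    #include <cstdint>
    #include <cstring>

    // Half is 1-5-10 (sign-exponent-mantissa) with a 15 exponent bias;
    // float is 1-8-23 with a 127 bias.

    static float from_half(uint16_t h) {
        uint32_t sem = h,
                 s   = sem & 0x8000,   // sign
                 em  = sem ^ s;        // exponent + mantissa
        // Denormal halfs (em < 0x0400, i.e. below 2^-14) flush to zero.
        uint32_t bits = (em < 0x0400)
                      ? 0
                      : (s << 16) + (em << 13) + ((127 - 15) << 23);
        float f;
        std::memcpy(&f, &bits, sizeof f);  // bit-cast to float
        return f;
    }

    static uint16_t to_half(float f) {
        uint32_t sem;
        std::memcpy(&sem, &f, sizeof sem);  // bit-cast to uint32_t
        uint32_t s  = sem & 0x80000000u,    // sign
                 em = sem ^ s;              // exponent + mantissa
        // Values below the smallest normal half (2^-14, which is
        // 0x38800000 as float bits) flush to zero.
        return (em < 0x38800000u)
             ? 0
             : (uint16_t)((s >> 16) + (em >> 13) - ((127 - 15) << 10));
    }

As a sanity check, 1.0f (0x3F800000) round-trips: em >> 13 is 0x1FC00,
minus 0x1C000 gives the half 0x3C00, and converting back adds the bias
0x38000000 to 0x3C00 << 13 to recover 0x3F800000.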

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Android-Clang-PixelC-CPU-TegraX1-arm64-Release-Android,Test-Android-Clang-Ci20-CPU-IngenicJZ4780-mipsel-Release-Android,Test-Android-Clang-Nexus10-CPU-Exynos5250-arm-Release-Android,Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Release,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86-Debug,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Debug

Change-Id: I439463cf90935c5e8fe2369cbcf45e07f3af62c7
Reviewed-on: https://skia-review.googlesource.com/13921
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Matt Sarett <msarett@google.com>
diff --git a/gn/tests.gni b/gn/tests.gni
index be368f0..5955a32 100644
--- a/gn/tests.gni
+++ b/gn/tests.gni
@@ -63,6 +63,7 @@
   "$_tests/EGLImageTest.cpp",
   "$_tests/EmptyPathTest.cpp",
   "$_tests/ExifTest.cpp",
+  "$_tests/F16StagesTest.cpp",
   "$_tests/FillPathTest.cpp",
   "$_tests/FitsInTest.cpp",
   "$_tests/FlattenableCustomFactory.cpp",
diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h
index dd978a2..f6c7615 100644
--- a/src/core/SkHalf.h
+++ b/src/core/SkHalf.h
@@ -16,7 +16,7 @@
 // only used for storage
 typedef uint16_t SkHalf;
 
-static constexpr uint16_t SK_HalfMin     = 0x0400; // 2^-24  (minimum positive normal value)
+static constexpr uint16_t SK_HalfMin     = 0x0400; // 2^-14  (minimum positive normal value)
 static constexpr uint16_t SK_HalfMax     = 0x7bff; // 65504
 static constexpr uint16_t SK_HalfEpsilon = 0x1400; // 2^-10
 static constexpr uint16_t SK_Half1       = 0x3C00; // 1
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 9aa29d0..a66e059 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -16190,91 +16190,158 @@
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,17,1,0,0                     // jne           4367 <_sk_load_f16_avx+0x11f>
+  .byte  197,252,17,124,36,200               // vmovups       %ymm7,-0x38(%rsp)
+  .byte  197,252,17,116,36,168               // vmovups       %ymm6,-0x58(%rsp)
+  .byte  197,252,17,108,36,136               // vmovups       %ymm5,-0x78(%rsp)
+  .byte  15,133,101,2,0,0                    // jne           44cd <_sk_load_f16_avx+0x285>
   .byte  197,121,16,4,248                    // vmovupd       (%rax,%rdi,8),%xmm8
   .byte  197,249,16,84,248,16                // vmovupd       0x10(%rax,%rdi,8),%xmm2
-  .byte  197,249,16,92,248,32                // vmovupd       0x20(%rax,%rdi,8),%xmm3
+  .byte  197,249,16,76,248,32                // vmovupd       0x20(%rax,%rdi,8),%xmm1
   .byte  197,122,111,76,248,48               // vmovdqu       0x30(%rax,%rdi,8),%xmm9
   .byte  197,185,97,194                      // vpunpcklwd    %xmm2,%xmm8,%xmm0
   .byte  197,185,105,210                     // vpunpckhwd    %xmm2,%xmm8,%xmm2
-  .byte  196,193,97,97,201                   // vpunpcklwd    %xmm9,%xmm3,%xmm1
-  .byte  196,193,97,105,217                  // vpunpckhwd    %xmm9,%xmm3,%xmm3
-  .byte  197,121,97,218                      // vpunpcklwd    %xmm2,%xmm0,%xmm11
+  .byte  196,193,113,97,217                  // vpunpcklwd    %xmm9,%xmm1,%xmm3
+  .byte  196,193,113,105,201                 // vpunpckhwd    %xmm9,%xmm1,%xmm1
+  .byte  197,121,97,242                      // vpunpcklwd    %xmm2,%xmm0,%xmm14
   .byte  197,121,105,194                     // vpunpckhwd    %xmm2,%xmm0,%xmm8
-  .byte  197,241,97,211                      // vpunpcklwd    %xmm3,%xmm1,%xmm2
-  .byte  197,113,105,203                     // vpunpckhwd    %xmm3,%xmm1,%xmm9
-  .byte  197,161,108,194                     // vpunpcklqdq   %xmm2,%xmm11,%xmm0
-  .byte  184,0,4,0,4                         // mov           $0x4000400,%eax
-  .byte  197,249,110,200                     // vmovd         %eax,%xmm1
-  .byte  197,121,112,233,0                   // vpshufd       $0x0,%xmm1,%xmm13
-  .byte  197,145,101,200                     // vpcmpgtw      %xmm0,%xmm13,%xmm1
-  .byte  197,241,223,192                     // vpandn        %xmm0,%xmm1,%xmm0
-  .byte  196,226,121,51,200                  // vpmovzxwd     %xmm0,%xmm1
-  .byte  196,65,41,239,210                   // vpxor         %xmm10,%xmm10,%xmm10
-  .byte  196,193,121,105,194                 // vpunpckhwd    %xmm10,%xmm0,%xmm0
-  .byte  197,241,114,241,13                  // vpslld        $0xd,%xmm1,%xmm1
-  .byte  197,249,114,240,13                  // vpslld        $0xd,%xmm0,%xmm0
-  .byte  196,227,117,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
-  .byte  184,0,0,128,119                     // mov           $0x77800000,%eax
+  .byte  197,97,97,249                       // vpunpcklwd    %xmm1,%xmm3,%xmm15
+  .byte  197,97,105,209                      // vpunpckhwd    %xmm1,%xmm3,%xmm10
+  .byte  196,193,9,108,199                   // vpunpcklqdq   %xmm15,%xmm14,%xmm0
+  .byte  196,65,25,239,228                   // vpxor         %xmm12,%xmm12,%xmm12
+  .byte  196,193,121,105,204                 // vpunpckhwd    %xmm12,%xmm0,%xmm1
+  .byte  196,226,121,51,192                  // vpmovzxwd     %xmm0,%xmm0
+  .byte  196,227,125,24,193,1                // vinsertf128   $0x1,%xmm1,%ymm0,%ymm0
+  .byte  184,0,128,0,0                       // mov           $0x8000,%eax
   .byte  197,249,110,200                     // vmovd         %eax,%xmm1
   .byte  197,249,112,201,0                   // vpshufd       $0x0,%xmm1,%xmm1
-  .byte  196,99,117,24,225,1                 // vinsertf128   $0x1,%xmm1,%ymm1,%ymm12
-  .byte  197,156,89,192                      // vmulps        %ymm0,%ymm12,%ymm0
-  .byte  197,161,109,202                     // vpunpckhqdq   %xmm2,%xmm11,%xmm1
-  .byte  197,145,101,209                     // vpcmpgtw      %xmm1,%xmm13,%xmm2
-  .byte  197,233,223,201                     // vpandn        %xmm1,%xmm2,%xmm1
-  .byte  196,226,121,51,209                  // vpmovzxwd     %xmm1,%xmm2
-  .byte  196,193,113,105,202                 // vpunpckhwd    %xmm10,%xmm1,%xmm1
-  .byte  197,233,114,242,13                  // vpslld        $0xd,%xmm2,%xmm2
+  .byte  196,99,117,24,201,1                 // vinsertf128   $0x1,%xmm1,%ymm1,%ymm9
+  .byte  196,193,124,84,201                  // vandps        %ymm9,%ymm0,%ymm1
+  .byte  184,0,124,0,0                       // mov           $0x7c00,%eax
+  .byte  197,249,110,216                     // vmovd         %eax,%xmm3
+  .byte  197,249,112,219,0                   // vpshufd       $0x0,%xmm3,%xmm3
+  .byte  196,99,101,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm3,%ymm11
+  .byte  196,193,124,84,219                  // vandps        %ymm11,%ymm0,%ymm3
+  .byte  197,252,87,193                      // vxorps        %ymm1,%ymm0,%ymm0
+  .byte  196,227,125,25,218,1                // vextractf128  $0x1,%ymm3,%xmm2
+  .byte  196,193,105,118,212                 // vpcmpeqd      %xmm12,%xmm2,%xmm2
+  .byte  196,193,97,118,220                  // vpcmpeqd      %xmm12,%xmm3,%xmm3
+  .byte  196,227,101,24,242,1                // vinsertf128   $0x1,%xmm2,%ymm3,%ymm6
+  .byte  196,227,125,25,203,1                // vextractf128  $0x1,%ymm1,%xmm3
+  .byte  197,145,114,243,16                  // vpslld        $0x10,%xmm3,%xmm13
+  .byte  196,227,125,25,195,1                // vextractf128  $0x1,%ymm0,%xmm3
+  .byte  197,233,114,243,13                  // vpslld        $0xd,%xmm3,%xmm2
+  .byte  184,0,0,0,56                        // mov           $0x38000000,%eax
+  .byte  197,249,110,216                     // vmovd         %eax,%xmm3
+  .byte  197,249,112,219,0                   // vpshufd       $0x0,%xmm3,%xmm3
+  .byte  197,145,254,251                     // vpaddd        %xmm3,%xmm13,%xmm7
+  .byte  197,193,254,210                     // vpaddd        %xmm2,%xmm7,%xmm2
+  .byte  197,241,114,241,16                  // vpslld        $0x10,%xmm1,%xmm1
+  .byte  197,249,114,240,13                  // vpslld        $0xd,%xmm0,%xmm0
+  .byte  197,241,254,203                     // vpaddd        %xmm3,%xmm1,%xmm1
+  .byte  197,241,254,192                     // vpaddd        %xmm0,%xmm1,%xmm0
+  .byte  196,227,125,24,194,1                // vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
+  .byte  196,65,20,87,237                    // vxorps        %ymm13,%ymm13,%ymm13
+  .byte  196,195,125,74,197,96               // vblendvps     %ymm6,%ymm13,%ymm0,%ymm0
+  .byte  196,193,9,109,207                   // vpunpckhqdq   %xmm15,%xmm14,%xmm1
+  .byte  196,193,113,105,212                 // vpunpckhwd    %xmm12,%xmm1,%xmm2
+  .byte  196,226,121,51,201                  // vpmovzxwd     %xmm1,%xmm1
+  .byte  196,227,117,24,202,1                // vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  .byte  196,193,116,84,209                  // vandps        %ymm9,%ymm1,%ymm2
+  .byte  196,193,116,84,243                  // vandps        %ymm11,%ymm1,%ymm6
+  .byte  197,244,87,202                      // vxorps        %ymm2,%ymm1,%ymm1
+  .byte  196,227,125,25,247,1                // vextractf128  $0x1,%ymm6,%xmm7
+  .byte  196,193,65,118,252                  // vpcmpeqd      %xmm12,%xmm7,%xmm7
+  .byte  196,193,73,118,244                  // vpcmpeqd      %xmm12,%xmm6,%xmm6
+  .byte  196,99,77,24,247,1                  // vinsertf128   $0x1,%xmm7,%ymm6,%ymm14
+  .byte  196,227,125,25,215,1                // vextractf128  $0x1,%ymm2,%xmm7
+  .byte  197,193,114,247,16                  // vpslld        $0x10,%xmm7,%xmm7
+  .byte  196,227,125,25,206,1                // vextractf128  $0x1,%ymm1,%xmm6
+  .byte  197,201,114,246,13                  // vpslld        $0xd,%xmm6,%xmm6
+  .byte  197,193,254,251                     // vpaddd        %xmm3,%xmm7,%xmm7
+  .byte  197,193,254,246                     // vpaddd        %xmm6,%xmm7,%xmm6
+  .byte  197,233,114,242,16                  // vpslld        $0x10,%xmm2,%xmm2
   .byte  197,241,114,241,13                  // vpslld        $0xd,%xmm1,%xmm1
-  .byte  196,227,109,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  .byte  197,156,89,201                      // vmulps        %ymm1,%ymm12,%ymm1
-  .byte  196,193,57,108,209                  // vpunpcklqdq   %xmm9,%xmm8,%xmm2
-  .byte  197,145,101,218                     // vpcmpgtw      %xmm2,%xmm13,%xmm3
-  .byte  197,225,223,210                     // vpandn        %xmm2,%xmm3,%xmm2
-  .byte  196,226,121,51,218                  // vpmovzxwd     %xmm2,%xmm3
-  .byte  196,193,105,105,210                 // vpunpckhwd    %xmm10,%xmm2,%xmm2
-  .byte  197,225,114,243,13                  // vpslld        $0xd,%xmm3,%xmm3
+  .byte  197,233,254,211                     // vpaddd        %xmm3,%xmm2,%xmm2
+  .byte  197,233,254,201                     // vpaddd        %xmm1,%xmm2,%xmm1
+  .byte  196,227,117,24,206,1                // vinsertf128   $0x1,%xmm6,%ymm1,%ymm1
+  .byte  196,195,117,74,205,224              // vblendvps     %ymm14,%ymm13,%ymm1,%ymm1
+  .byte  196,193,57,108,210                  // vpunpcklqdq   %xmm10,%xmm8,%xmm2
+  .byte  196,193,105,105,244                 // vpunpckhwd    %xmm12,%xmm2,%xmm6
+  .byte  196,226,121,51,210                  // vpmovzxwd     %xmm2,%xmm2
+  .byte  196,227,109,24,214,1                // vinsertf128   $0x1,%xmm6,%ymm2,%ymm2
+  .byte  196,193,108,84,243                  // vandps        %ymm11,%ymm2,%ymm6
+  .byte  196,227,125,25,247,1                // vextractf128  $0x1,%ymm6,%xmm7
+  .byte  196,193,65,118,252                  // vpcmpeqd      %xmm12,%xmm7,%xmm7
+  .byte  196,193,73,118,244                  // vpcmpeqd      %xmm12,%xmm6,%xmm6
+  .byte  196,99,77,24,247,1                  // vinsertf128   $0x1,%xmm7,%ymm6,%ymm14
+  .byte  196,193,108,84,249                  // vandps        %ymm9,%ymm2,%ymm7
+  .byte  197,236,87,215                      // vxorps        %ymm7,%ymm2,%ymm2
+  .byte  196,227,125,25,254,1                // vextractf128  $0x1,%ymm7,%xmm6
+  .byte  197,129,114,246,16                  // vpslld        $0x10,%xmm6,%xmm15
+  .byte  196,227,125,25,214,1                // vextractf128  $0x1,%ymm2,%xmm6
+  .byte  197,209,114,246,13                  // vpslld        $0xd,%xmm6,%xmm5
+  .byte  197,129,254,243                     // vpaddd        %xmm3,%xmm15,%xmm6
+  .byte  197,201,254,237                     // vpaddd        %xmm5,%xmm6,%xmm5
+  .byte  197,201,114,247,16                  // vpslld        $0x10,%xmm7,%xmm6
   .byte  197,233,114,242,13                  // vpslld        $0xd,%xmm2,%xmm2
-  .byte  196,227,101,24,210,1                // vinsertf128   $0x1,%xmm2,%ymm3,%ymm2
-  .byte  197,156,89,210                      // vmulps        %ymm2,%ymm12,%ymm2
-  .byte  196,65,57,109,193                   // vpunpckhqdq   %xmm9,%xmm8,%xmm8
-  .byte  196,193,17,101,216                  // vpcmpgtw      %xmm8,%xmm13,%xmm3
-  .byte  196,193,97,223,216                  // vpandn        %xmm8,%xmm3,%xmm3
-  .byte  196,98,121,51,195                   // vpmovzxwd     %xmm3,%xmm8
-  .byte  196,193,97,105,218                  // vpunpckhwd    %xmm10,%xmm3,%xmm3
-  .byte  196,193,57,114,240,13               // vpslld        $0xd,%xmm8,%xmm8
-  .byte  197,225,114,243,13                  // vpslld        $0xd,%xmm3,%xmm3
-  .byte  196,227,61,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  .byte  197,156,89,219                      // vmulps        %ymm3,%ymm12,%ymm3
+  .byte  197,201,254,243                     // vpaddd        %xmm3,%xmm6,%xmm6
+  .byte  197,201,254,210                     // vpaddd        %xmm2,%xmm6,%xmm2
+  .byte  196,227,109,24,213,1                // vinsertf128   $0x1,%xmm5,%ymm2,%ymm2
+  .byte  196,195,109,74,213,224              // vblendvps     %ymm14,%ymm13,%ymm2,%ymm2
+  .byte  196,193,57,109,234                  // vpunpckhqdq   %xmm10,%xmm8,%xmm5
+  .byte  196,193,81,105,244                  // vpunpckhwd    %xmm12,%xmm5,%xmm6
+  .byte  196,226,121,51,237                  // vpmovzxwd     %xmm5,%xmm5
+  .byte  196,227,85,24,238,1                 // vinsertf128   $0x1,%xmm6,%ymm5,%ymm5
+  .byte  196,193,84,84,243                   // vandps        %ymm11,%ymm5,%ymm6
+  .byte  196,227,125,25,247,1                // vextractf128  $0x1,%ymm6,%xmm7
+  .byte  196,193,65,118,252                  // vpcmpeqd      %xmm12,%xmm7,%xmm7
+  .byte  196,193,73,118,244                  // vpcmpeqd      %xmm12,%xmm6,%xmm6
+  .byte  196,65,84,84,193                    // vandps        %ymm9,%ymm5,%ymm8
+  .byte  196,193,84,87,232                   // vxorps        %ymm8,%ymm5,%ymm5
+  .byte  196,99,77,24,207,1                  // vinsertf128   $0x1,%xmm7,%ymm6,%ymm9
+  .byte  196,99,125,25,199,1                 // vextractf128  $0x1,%ymm8,%xmm7
+  .byte  197,193,114,247,16                  // vpslld        $0x10,%xmm7,%xmm7
+  .byte  196,193,73,114,240,16               // vpslld        $0x10,%xmm8,%xmm6
+  .byte  197,201,254,243                     // vpaddd        %xmm3,%xmm6,%xmm6
+  .byte  197,193,254,219                     // vpaddd        %xmm3,%xmm7,%xmm3
+  .byte  196,227,125,25,239,1                // vextractf128  $0x1,%ymm5,%xmm7
+  .byte  197,193,114,247,13                  // vpslld        $0xd,%xmm7,%xmm7
+  .byte  197,225,254,223                     // vpaddd        %xmm7,%xmm3,%xmm3
+  .byte  197,209,114,245,13                  // vpslld        $0xd,%xmm5,%xmm5
+  .byte  197,201,254,237                     // vpaddd        %xmm5,%xmm6,%xmm5
+  .byte  196,227,85,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm5,%ymm3
+  .byte  196,195,101,74,221,144              // vblendvps     %ymm9,%ymm13,%ymm3,%ymm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,108,36,136               // vmovups       -0x78(%rsp),%ymm5
+  .byte  197,252,16,116,36,168               // vmovups       -0x58(%rsp),%ymm6
+  .byte  197,252,16,124,36,200               // vmovups       -0x38(%rsp),%ymm7
   .byte  255,224                             // jmpq          *%rax
   .byte  197,123,16,4,248                    // vmovsd        (%rax,%rdi,8),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,79                              // je            43c6 <_sk_load_f16_avx+0x17e>
+  .byte  116,79                              // je            452c <_sk_load_f16_avx+0x2e4>
   .byte  197,57,22,68,248,8                  // vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,67                              // jb            43c6 <_sk_load_f16_avx+0x17e>
+  .byte  114,67                              // jb            452c <_sk_load_f16_avx+0x2e4>
   .byte  197,251,16,84,248,16                // vmovsd        0x10(%rax,%rdi,8),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,68                              // je            43d3 <_sk_load_f16_avx+0x18b>
+  .byte  116,68                              // je            4539 <_sk_load_f16_avx+0x2f1>
   .byte  197,233,22,84,248,24                // vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,56                              // jb            43d3 <_sk_load_f16_avx+0x18b>
-  .byte  197,251,16,92,248,32                // vmovsd        0x20(%rax,%rdi,8),%xmm3
+  .byte  114,56                              // jb            4539 <_sk_load_f16_avx+0x2f1>
+  .byte  197,251,16,76,248,32                // vmovsd        0x20(%rax,%rdi,8),%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,194,254,255,255              // je            426d <_sk_load_f16_avx+0x25>
-  .byte  197,225,22,92,248,40                // vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
+  .byte  15,132,110,253,255,255              // je            427f <_sk_load_f16_avx+0x37>
+  .byte  197,241,22,76,248,40                // vmovhpd       0x28(%rax,%rdi,8),%xmm1,%xmm1
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,178,254,255,255              // jb            426d <_sk_load_f16_avx+0x25>
+  .byte  15,130,94,253,255,255               // jb            427f <_sk_load_f16_avx+0x37>
   .byte  197,122,126,76,248,48               // vmovq         0x30(%rax,%rdi,8),%xmm9
-  .byte  233,167,254,255,255                 // jmpq          426d <_sk_load_f16_avx+0x25>
-  .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
+  .byte  233,83,253,255,255                  // jmpq          427f <_sk_load_f16_avx+0x37>
+  .byte  197,241,87,201                      // vxorpd        %xmm1,%xmm1,%xmm1
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,154,254,255,255                 // jmpq          426d <_sk_load_f16_avx+0x25>
-  .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,145,254,255,255                 // jmpq          426d <_sk_load_f16_avx+0x25>
+  .byte  233,70,253,255,255                  // jmpq          427f <_sk_load_f16_avx+0x37>
+  .byte  197,241,87,201                      // vxorpd        %xmm1,%xmm1,%xmm1
+  .byte  233,61,253,255,255                  // jmpq          427f <_sk_load_f16_avx+0x37>
 
 HIDDEN _sk_gather_f16_avx
 .globl _sk_gather_f16_avx
@@ -16284,6 +16351,11 @@
   .byte  65,86                               // push          %r14
   .byte  65,84                               // push          %r12
   .byte  83                                  // push          %rbx
+  .byte  72,131,236,24                       // sub           $0x18,%rsp
+  .byte  197,252,17,124,36,224               // vmovups       %ymm7,-0x20(%rsp)
+  .byte  197,252,17,116,36,192               // vmovups       %ymm6,-0x40(%rsp)
+  .byte  197,252,17,108,36,160               // vmovups       %ymm5,-0x60(%rsp)
+  .byte  197,252,17,100,36,128               // vmovups       %ymm4,-0x80(%rsp)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  197,254,91,209                      // vcvttps2dq    %ymm1,%ymm2
@@ -16324,55 +16396,121 @@
   .byte  197,177,105,201                     // vpunpckhwd    %xmm1,%xmm9,%xmm1
   .byte  197,169,97,211                      // vpunpcklwd    %xmm3,%xmm10,%xmm2
   .byte  197,169,105,219                     // vpunpckhwd    %xmm3,%xmm10,%xmm3
-  .byte  197,121,97,217                      // vpunpcklwd    %xmm1,%xmm0,%xmm11
+  .byte  197,121,97,241                      // vpunpcklwd    %xmm1,%xmm0,%xmm14
   .byte  197,121,105,193                     // vpunpckhwd    %xmm1,%xmm0,%xmm8
-  .byte  197,233,97,203                      // vpunpcklwd    %xmm3,%xmm2,%xmm1
-  .byte  197,105,105,203                     // vpunpckhwd    %xmm3,%xmm2,%xmm9
-  .byte  197,161,108,193                     // vpunpcklqdq   %xmm1,%xmm11,%xmm0
-  .byte  184,0,4,0,4                         // mov           $0x4000400,%eax
-  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
-  .byte  197,121,112,234,0                   // vpshufd       $0x0,%xmm2,%xmm13
-  .byte  197,145,101,208                     // vpcmpgtw      %xmm0,%xmm13,%xmm2
-  .byte  197,233,223,192                     // vpandn        %xmm0,%xmm2,%xmm0
-  .byte  196,226,121,51,208                  // vpmovzxwd     %xmm0,%xmm2
-  .byte  196,65,41,239,210                   // vpxor         %xmm10,%xmm10,%xmm10
-  .byte  196,193,121,105,194                 // vpunpckhwd    %xmm10,%xmm0,%xmm0
-  .byte  197,233,114,242,13                  // vpslld        $0xd,%xmm2,%xmm2
-  .byte  197,249,114,240,13                  // vpslld        $0xd,%xmm0,%xmm0
-  .byte  196,227,109,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm2,%ymm0
-  .byte  184,0,0,128,119                     // mov           $0x77800000,%eax
+  .byte  197,105,97,251                      // vpunpcklwd    %xmm3,%xmm2,%xmm15
+  .byte  197,105,105,211                     // vpunpckhwd    %xmm3,%xmm2,%xmm10
+  .byte  196,193,9,108,199                   // vpunpcklqdq   %xmm15,%xmm14,%xmm0
+  .byte  196,65,25,239,228                   // vpxor         %xmm12,%xmm12,%xmm12
+  .byte  196,193,121,105,212                 // vpunpckhwd    %xmm12,%xmm0,%xmm2
+  .byte  196,226,121,51,192                  // vpmovzxwd     %xmm0,%xmm0
+  .byte  196,227,125,24,194,1                // vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
+  .byte  184,0,128,0,0                       // mov           $0x8000,%eax
   .byte  197,249,110,208                     // vmovd         %eax,%xmm2
   .byte  197,249,112,210,0                   // vpshufd       $0x0,%xmm2,%xmm2
-  .byte  196,99,109,24,226,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm12
-  .byte  197,156,89,192                      // vmulps        %ymm0,%ymm12,%ymm0
-  .byte  197,161,109,201                     // vpunpckhqdq   %xmm1,%xmm11,%xmm1
-  .byte  197,145,101,209                     // vpcmpgtw      %xmm1,%xmm13,%xmm2
-  .byte  197,233,223,201                     // vpandn        %xmm1,%xmm2,%xmm1
-  .byte  196,226,121,51,209                  // vpmovzxwd     %xmm1,%xmm2
-  .byte  196,193,113,105,202                 // vpunpckhwd    %xmm10,%xmm1,%xmm1
-  .byte  197,233,114,242,13                  // vpslld        $0xd,%xmm2,%xmm2
+  .byte  196,99,109,24,202,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
+  .byte  196,193,124,84,209                  // vandps        %ymm9,%ymm0,%ymm2
+  .byte  184,0,124,0,0                       // mov           $0x7c00,%eax
+  .byte  197,249,110,216                     // vmovd         %eax,%xmm3
+  .byte  197,249,112,219,0                   // vpshufd       $0x0,%xmm3,%xmm3
+  .byte  196,99,101,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm3,%ymm11
+  .byte  196,193,124,84,219                  // vandps        %ymm11,%ymm0,%ymm3
+  .byte  197,252,87,194                      // vxorps        %ymm2,%ymm0,%ymm0
+  .byte  196,227,125,25,217,1                // vextractf128  $0x1,%ymm3,%xmm1
+  .byte  196,193,113,118,204                 // vpcmpeqd      %xmm12,%xmm1,%xmm1
+  .byte  196,193,97,118,220                  // vpcmpeqd      %xmm12,%xmm3,%xmm3
+  .byte  196,227,101,24,225,1                // vinsertf128   $0x1,%xmm1,%ymm3,%ymm4
+  .byte  196,227,125,25,211,1                // vextractf128  $0x1,%ymm2,%xmm3
+  .byte  197,145,114,243,16                  // vpslld        $0x10,%xmm3,%xmm13
+  .byte  196,227,125,25,195,1                // vextractf128  $0x1,%ymm0,%xmm3
+  .byte  197,241,114,243,13                  // vpslld        $0xd,%xmm3,%xmm1
+  .byte  184,0,0,0,56                        // mov           $0x38000000,%eax
+  .byte  197,249,110,216                     // vmovd         %eax,%xmm3
+  .byte  197,249,112,219,0                   // vpshufd       $0x0,%xmm3,%xmm3
+  .byte  197,145,254,251                     // vpaddd        %xmm3,%xmm13,%xmm7
+  .byte  197,193,254,201                     // vpaddd        %xmm1,%xmm7,%xmm1
+  .byte  197,233,114,242,16                  // vpslld        $0x10,%xmm2,%xmm2
+  .byte  197,249,114,240,13                  // vpslld        $0xd,%xmm0,%xmm0
+  .byte  197,233,254,211                     // vpaddd        %xmm3,%xmm2,%xmm2
+  .byte  197,233,254,192                     // vpaddd        %xmm0,%xmm2,%xmm0
+  .byte  196,227,125,24,193,1                // vinsertf128   $0x1,%xmm1,%ymm0,%ymm0
+  .byte  196,65,20,87,237                    // vxorps        %ymm13,%ymm13,%ymm13
+  .byte  196,195,125,74,197,64               // vblendvps     %ymm4,%ymm13,%ymm0,%ymm0
+  .byte  196,193,9,109,207                   // vpunpckhqdq   %xmm15,%xmm14,%xmm1
+  .byte  196,193,113,105,212                 // vpunpckhwd    %xmm12,%xmm1,%xmm2
+  .byte  196,226,121,51,201                  // vpmovzxwd     %xmm1,%xmm1
+  .byte  196,227,117,24,202,1                // vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  .byte  196,193,116,84,209                  // vandps        %ymm9,%ymm1,%ymm2
+  .byte  196,193,116,84,227                  // vandps        %ymm11,%ymm1,%ymm4
+  .byte  197,244,87,202                      // vxorps        %ymm2,%ymm1,%ymm1
+  .byte  196,227,125,25,231,1                // vextractf128  $0x1,%ymm4,%xmm7
+  .byte  196,193,65,118,252                  // vpcmpeqd      %xmm12,%xmm7,%xmm7
+  .byte  196,193,89,118,228                  // vpcmpeqd      %xmm12,%xmm4,%xmm4
+  .byte  196,227,93,24,231,1                 // vinsertf128   $0x1,%xmm7,%ymm4,%ymm4
+  .byte  196,227,125,25,215,1                // vextractf128  $0x1,%ymm2,%xmm7
+  .byte  197,193,114,247,16                  // vpslld        $0x10,%xmm7,%xmm7
+  .byte  196,227,125,25,206,1                // vextractf128  $0x1,%ymm1,%xmm6
+  .byte  197,201,114,246,13                  // vpslld        $0xd,%xmm6,%xmm6
+  .byte  197,193,254,251                     // vpaddd        %xmm3,%xmm7,%xmm7
+  .byte  197,193,254,246                     // vpaddd        %xmm6,%xmm7,%xmm6
+  .byte  197,233,114,242,16                  // vpslld        $0x10,%xmm2,%xmm2
   .byte  197,241,114,241,13                  // vpslld        $0xd,%xmm1,%xmm1
-  .byte  196,227,109,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  .byte  197,156,89,201                      // vmulps        %ymm1,%ymm12,%ymm1
-  .byte  196,193,57,108,209                  // vpunpcklqdq   %xmm9,%xmm8,%xmm2
-  .byte  197,145,101,218                     // vpcmpgtw      %xmm2,%xmm13,%xmm3
-  .byte  197,225,223,210                     // vpandn        %xmm2,%xmm3,%xmm2
-  .byte  196,226,121,51,218                  // vpmovzxwd     %xmm2,%xmm3
-  .byte  196,193,105,105,210                 // vpunpckhwd    %xmm10,%xmm2,%xmm2
-  .byte  197,225,114,243,13                  // vpslld        $0xd,%xmm3,%xmm3
+  .byte  197,233,254,211                     // vpaddd        %xmm3,%xmm2,%xmm2
+  .byte  197,233,254,201                     // vpaddd        %xmm1,%xmm2,%xmm1
+  .byte  196,227,117,24,206,1                // vinsertf128   $0x1,%xmm6,%ymm1,%ymm1
+  .byte  196,195,117,74,205,64               // vblendvps     %ymm4,%ymm13,%ymm1,%ymm1
+  .byte  196,193,57,108,210                  // vpunpcklqdq   %xmm10,%xmm8,%xmm2
+  .byte  196,193,105,105,228                 // vpunpckhwd    %xmm12,%xmm2,%xmm4
+  .byte  196,226,121,51,210                  // vpmovzxwd     %xmm2,%xmm2
+  .byte  196,227,109,24,212,1                // vinsertf128   $0x1,%xmm4,%ymm2,%ymm2
+  .byte  196,193,108,84,227                  // vandps        %ymm11,%ymm2,%ymm4
+  .byte  196,227,125,25,230,1                // vextractf128  $0x1,%ymm4,%xmm6
+  .byte  196,193,73,118,244                  // vpcmpeqd      %xmm12,%xmm6,%xmm6
+  .byte  196,193,89,118,228                  // vpcmpeqd      %xmm12,%xmm4,%xmm4
+  .byte  196,227,93,24,230,1                 // vinsertf128   $0x1,%xmm6,%ymm4,%ymm4
+  .byte  196,193,108,84,241                  // vandps        %ymm9,%ymm2,%ymm6
+  .byte  197,236,87,214                      // vxorps        %ymm6,%ymm2,%ymm2
+  .byte  196,227,125,25,247,1                // vextractf128  $0x1,%ymm6,%xmm7
+  .byte  197,193,114,247,16                  // vpslld        $0x10,%xmm7,%xmm7
+  .byte  196,227,125,25,213,1                // vextractf128  $0x1,%ymm2,%xmm5
+  .byte  197,209,114,245,13                  // vpslld        $0xd,%xmm5,%xmm5
+  .byte  197,193,254,251                     // vpaddd        %xmm3,%xmm7,%xmm7
+  .byte  197,193,254,237                     // vpaddd        %xmm5,%xmm7,%xmm5
+  .byte  197,201,114,246,16                  // vpslld        $0x10,%xmm6,%xmm6
   .byte  197,233,114,242,13                  // vpslld        $0xd,%xmm2,%xmm2
-  .byte  196,227,101,24,210,1                // vinsertf128   $0x1,%xmm2,%ymm3,%ymm2
-  .byte  197,156,89,210                      // vmulps        %ymm2,%ymm12,%ymm2
-  .byte  196,65,57,109,193                   // vpunpckhqdq   %xmm9,%xmm8,%xmm8
-  .byte  196,193,17,101,216                  // vpcmpgtw      %xmm8,%xmm13,%xmm3
-  .byte  196,193,97,223,216                  // vpandn        %xmm8,%xmm3,%xmm3
-  .byte  196,98,121,51,195                   // vpmovzxwd     %xmm3,%xmm8
-  .byte  196,193,97,105,218                  // vpunpckhwd    %xmm10,%xmm3,%xmm3
-  .byte  196,193,57,114,240,13               // vpslld        $0xd,%xmm8,%xmm8
-  .byte  197,225,114,243,13                  // vpslld        $0xd,%xmm3,%xmm3
-  .byte  196,227,61,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  .byte  197,156,89,219                      // vmulps        %ymm3,%ymm12,%ymm3
+  .byte  197,201,254,243                     // vpaddd        %xmm3,%xmm6,%xmm6
+  .byte  197,201,254,210                     // vpaddd        %xmm2,%xmm6,%xmm2
+  .byte  196,227,109,24,213,1                // vinsertf128   $0x1,%xmm5,%ymm2,%ymm2
+  .byte  196,195,109,74,213,64               // vblendvps     %ymm4,%ymm13,%ymm2,%ymm2
+  .byte  196,193,57,109,226                  // vpunpckhqdq   %xmm10,%xmm8,%xmm4
+  .byte  196,193,89,105,236                  // vpunpckhwd    %xmm12,%xmm4,%xmm5
+  .byte  196,226,121,51,228                  // vpmovzxwd     %xmm4,%xmm4
+  .byte  196,227,93,24,229,1                 // vinsertf128   $0x1,%xmm5,%ymm4,%ymm4
+  .byte  196,193,92,84,235                   // vandps        %ymm11,%ymm4,%ymm5
+  .byte  196,227,125,25,238,1                // vextractf128  $0x1,%ymm5,%xmm6
+  .byte  196,193,73,118,244                  // vpcmpeqd      %xmm12,%xmm6,%xmm6
+  .byte  196,193,81,118,236                  // vpcmpeqd      %xmm12,%xmm5,%xmm5
+  .byte  196,193,92,84,249                   // vandps        %ymm9,%ymm4,%ymm7
+  .byte  197,220,87,231                      // vxorps        %ymm7,%ymm4,%ymm4
+  .byte  196,227,85,24,238,1                 // vinsertf128   $0x1,%xmm6,%ymm5,%ymm5
+  .byte  196,227,125,25,254,1                // vextractf128  $0x1,%ymm7,%xmm6
+  .byte  197,201,114,246,16                  // vpslld        $0x10,%xmm6,%xmm6
+  .byte  197,193,114,247,16                  // vpslld        $0x10,%xmm7,%xmm7
+  .byte  197,193,254,251                     // vpaddd        %xmm3,%xmm7,%xmm7
+  .byte  197,201,254,219                     // vpaddd        %xmm3,%xmm6,%xmm3
+  .byte  196,227,125,25,230,1                // vextractf128  $0x1,%ymm4,%xmm6
+  .byte  197,201,114,246,13                  // vpslld        $0xd,%xmm6,%xmm6
+  .byte  197,225,254,222                     // vpaddd        %xmm6,%xmm3,%xmm3
+  .byte  197,217,114,244,13                  // vpslld        $0xd,%xmm4,%xmm4
+  .byte  197,193,254,228                     // vpaddd        %xmm4,%xmm7,%xmm4
+  .byte  196,227,93,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm4,%ymm3
+  .byte  196,195,101,74,221,80               // vblendvps     %ymm5,%ymm13,%ymm3,%ymm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,100,36,128               // vmovups       -0x80(%rsp),%ymm4
+  .byte  197,252,16,108,36,160               // vmovups       -0x60(%rsp),%ymm5
+  .byte  197,252,16,116,36,192               // vmovups       -0x40(%rsp),%ymm6
+  .byte  197,252,16,124,36,224               // vmovups       -0x20(%rsp),%ymm7
+  .byte  72,131,196,24                       // add           $0x18,%rsp
   .byte  91                                  // pop           %rbx
   .byte  65,92                               // pop           %r12
   .byte  65,94                               // pop           %r14
@@ -16383,66 +16521,136 @@
 .globl _sk_store_f16_avx
 FUNCTION(_sk_store_f16_avx)
 _sk_store_f16_avx:
+  .byte  72,131,236,88                       // sub           $0x58,%rsp
+  .byte  197,252,17,124,36,32                // vmovups       %ymm7,0x20(%rsp)
+  .byte  197,252,17,52,36                    // vmovups       %ymm6,(%rsp)
+  .byte  197,252,17,108,36,224               // vmovups       %ymm5,-0x20(%rsp)
+  .byte  197,252,17,100,36,192               // vmovups       %ymm4,-0x40(%rsp)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,0                            // mov           (%rax),%r8
-  .byte  184,0,0,128,7                       // mov           $0x7800000,%eax
+  .byte  184,0,0,0,128                       // mov           $0x80000000,%eax
   .byte  197,121,110,192                     // vmovd         %eax,%xmm8
   .byte  196,65,121,112,192,0                // vpshufd       $0x0,%xmm8,%xmm8
+  .byte  196,67,61,24,200,1                  // vinsertf128   $0x1,%xmm8,%ymm8,%ymm9
+  .byte  197,52,84,208                       // vandps        %ymm0,%ymm9,%ymm10
+  .byte  197,252,17,68,36,128                // vmovups       %ymm0,-0x80(%rsp)
+  .byte  196,65,124,87,218                   // vxorps        %ymm10,%ymm0,%ymm11
+  .byte  184,0,0,128,56                      // mov           $0x38800000,%eax
+  .byte  197,121,110,192                     // vmovd         %eax,%xmm8
+  .byte  196,67,121,4,192,0                  // vpermilps     $0x0,%xmm8,%xmm8
   .byte  196,67,61,24,192,1                  // vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  196,67,125,25,202,1                 // vextractf128  $0x1,%ymm9,%xmm10
-  .byte  196,193,41,114,210,13               // vpsrld        $0xd,%xmm10,%xmm10
-  .byte  196,193,49,114,209,13               // vpsrld        $0xd,%xmm9,%xmm9
-  .byte  196,66,49,43,202                    // vpackusdw     %xmm10,%xmm9,%xmm9
-  .byte  197,60,89,209                       // vmulps        %ymm1,%ymm8,%ymm10
-  .byte  196,67,125,25,211,1                 // vextractf128  $0x1,%ymm10,%xmm11
+  .byte  196,65,36,194,224,1                 // vcmpltps      %ymm8,%ymm11,%ymm12
+  .byte  196,67,125,25,213,1                 // vextractf128  $0x1,%ymm10,%xmm13
+  .byte  196,193,17,114,213,16               // vpsrld        $0x10,%xmm13,%xmm13
+  .byte  196,193,9,114,210,16                // vpsrld        $0x10,%xmm10,%xmm14
+  .byte  196,193,1,114,211,13                // vpsrld        $0xd,%xmm11,%xmm15
+  .byte  196,67,125,25,218,1                 // vextractf128  $0x1,%ymm11,%xmm10
+  .byte  196,193,33,114,210,13               // vpsrld        $0xd,%xmm10,%xmm11
+  .byte  184,0,192,1,0                       // mov           $0x1c000,%eax
+  .byte  197,121,110,208                     // vmovd         %eax,%xmm10
+  .byte  196,65,121,112,210,0                // vpshufd       $0x0,%xmm10,%xmm10
+  .byte  196,65,9,250,242                    // vpsubd        %xmm10,%xmm14,%xmm14
+  .byte  196,65,17,250,234                   // vpsubd        %xmm10,%xmm13,%xmm13
+  .byte  196,65,17,254,219                   // vpaddd        %xmm11,%xmm13,%xmm11
+  .byte  196,65,9,254,239                    // vpaddd        %xmm15,%xmm14,%xmm13
+  .byte  196,67,21,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm13,%ymm13
+  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
+  .byte  196,99,21,74,224,192                // vblendvps     %ymm12,%ymm0,%ymm13,%ymm12
+  .byte  197,52,84,233                       // vandps        %ymm1,%ymm9,%ymm13
+  .byte  197,252,17,76,36,160                // vmovups       %ymm1,-0x60(%rsp)
+  .byte  196,65,116,87,245                   // vxorps        %ymm13,%ymm1,%ymm14
+  .byte  196,67,125,25,239,1                 // vextractf128  $0x1,%ymm13,%xmm15
+  .byte  196,193,1,114,215,16                // vpsrld        $0x10,%xmm15,%xmm15
+  .byte  196,67,125,25,243,1                 // vextractf128  $0x1,%ymm14,%xmm11
   .byte  196,193,33,114,211,13               // vpsrld        $0xd,%xmm11,%xmm11
-  .byte  196,193,41,114,210,13               // vpsrld        $0xd,%xmm10,%xmm10
-  .byte  196,66,41,43,211                    // vpackusdw     %xmm11,%xmm10,%xmm10
-  .byte  197,60,89,218                       // vmulps        %ymm2,%ymm8,%ymm11
-  .byte  196,67,125,25,220,1                 // vextractf128  $0x1,%ymm11,%xmm12
-  .byte  196,193,25,114,212,13               // vpsrld        $0xd,%xmm12,%xmm12
-  .byte  196,193,33,114,211,13               // vpsrld        $0xd,%xmm11,%xmm11
-  .byte  196,66,33,43,220                    // vpackusdw     %xmm12,%xmm11,%xmm11
-  .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,67,125,25,196,1                 // vextractf128  $0x1,%ymm8,%xmm12
-  .byte  196,193,25,114,212,13               // vpsrld        $0xd,%xmm12,%xmm12
-  .byte  196,193,57,114,208,13               // vpsrld        $0xd,%xmm8,%xmm8
-  .byte  196,66,57,43,196                    // vpackusdw     %xmm12,%xmm8,%xmm8
-  .byte  196,65,49,97,226                    // vpunpcklwd    %xmm10,%xmm9,%xmm12
-  .byte  196,65,49,105,234                   // vpunpckhwd    %xmm10,%xmm9,%xmm13
-  .byte  196,65,33,97,200                    // vpunpcklwd    %xmm8,%xmm11,%xmm9
-  .byte  196,65,33,105,192                   // vpunpckhwd    %xmm8,%xmm11,%xmm8
-  .byte  196,65,25,98,217                    // vpunpckldq    %xmm9,%xmm12,%xmm11
-  .byte  196,65,25,106,209                   // vpunpckhdq    %xmm9,%xmm12,%xmm10
-  .byte  196,65,17,98,200                    // vpunpckldq    %xmm8,%xmm13,%xmm9
-  .byte  196,65,17,106,192                   // vpunpckhdq    %xmm8,%xmm13,%xmm8
+  .byte  196,193,1,250,250                   // vpsubd        %xmm10,%xmm15,%xmm7
+  .byte  196,193,65,254,251                  // vpaddd        %xmm11,%xmm7,%xmm7
+  .byte  196,193,73,114,213,16               // vpsrld        $0x10,%xmm13,%xmm6
+  .byte  196,193,73,250,242                  // vpsubd        %xmm10,%xmm6,%xmm6
+  .byte  196,193,81,114,214,13               // vpsrld        $0xd,%xmm14,%xmm5
+  .byte  197,201,254,237                     // vpaddd        %xmm5,%xmm6,%xmm5
+  .byte  196,193,12,194,240,1                // vcmpltps      %ymm8,%ymm14,%ymm6
+  .byte  196,227,85,24,239,1                 // vinsertf128   $0x1,%xmm7,%ymm5,%ymm5
+  .byte  196,99,85,74,232,96                 // vblendvps     %ymm6,%ymm0,%ymm5,%ymm13
+  .byte  197,180,84,234                      // vandps        %ymm2,%ymm9,%ymm5
+  .byte  196,227,125,25,238,1                // vextractf128  $0x1,%ymm5,%xmm6
+  .byte  197,201,114,214,16                  // vpsrld        $0x10,%xmm6,%xmm6
+  .byte  197,236,87,253                      // vxorps        %ymm5,%ymm2,%ymm7
+  .byte  196,227,125,25,252,1                // vextractf128  $0x1,%ymm7,%xmm4
+  .byte  197,217,114,212,13                  // vpsrld        $0xd,%xmm4,%xmm4
+  .byte  196,193,73,250,242                  // vpsubd        %xmm10,%xmm6,%xmm6
+  .byte  197,201,254,228                     // vpaddd        %xmm4,%xmm6,%xmm4
+  .byte  197,209,114,213,16                  // vpsrld        $0x10,%xmm5,%xmm5
+  .byte  196,193,81,250,234                  // vpsubd        %xmm10,%xmm5,%xmm5
+  .byte  197,201,114,215,13                  // vpsrld        $0xd,%xmm7,%xmm6
+  .byte  197,209,254,238                     // vpaddd        %xmm6,%xmm5,%xmm5
+  .byte  196,227,85,24,228,1                 // vinsertf128   $0x1,%xmm4,%ymm5,%ymm4
+  .byte  196,193,68,194,232,1                // vcmpltps      %ymm8,%ymm7,%ymm5
+  .byte  196,227,93,74,224,80                // vblendvps     %ymm5,%ymm0,%ymm4,%ymm4
+  .byte  197,180,84,235                      // vandps        %ymm3,%ymm9,%ymm5
+  .byte  196,227,125,25,238,1                // vextractf128  $0x1,%ymm5,%xmm6
+  .byte  197,201,114,214,16                  // vpsrld        $0x10,%xmm6,%xmm6
+  .byte  197,193,114,213,16                  // vpsrld        $0x10,%xmm5,%xmm7
+  .byte  196,193,65,250,250                  // vpsubd        %xmm10,%xmm7,%xmm7
+  .byte  196,193,73,250,242                  // vpsubd        %xmm10,%xmm6,%xmm6
+  .byte  197,228,87,237                      // vxorps        %ymm5,%ymm3,%ymm5
+  .byte  196,227,125,25,233,1                // vextractf128  $0x1,%ymm5,%xmm1
+  .byte  197,241,114,209,13                  // vpsrld        $0xd,%xmm1,%xmm1
+  .byte  197,201,254,201                     // vpaddd        %xmm1,%xmm6,%xmm1
+  .byte  196,193,84,194,240,1                // vcmpltps      %ymm8,%ymm5,%ymm6
+  .byte  197,209,114,213,13                  // vpsrld        $0xd,%xmm5,%xmm5
+  .byte  197,193,254,237                     // vpaddd        %xmm5,%xmm7,%xmm5
+  .byte  196,227,85,24,201,1                 // vinsertf128   $0x1,%xmm1,%ymm5,%ymm1
+  .byte  196,227,117,74,192,96               // vblendvps     %ymm6,%ymm0,%ymm1,%ymm0
+  .byte  196,99,125,25,225,1                 // vextractf128  $0x1,%ymm12,%xmm1
+  .byte  196,226,25,43,201                   // vpackusdw     %xmm1,%xmm12,%xmm1
+  .byte  196,99,125,25,237,1                 // vextractf128  $0x1,%ymm13,%xmm5
+  .byte  196,226,17,43,237                   // vpackusdw     %xmm5,%xmm13,%xmm5
+  .byte  196,227,125,25,230,1                // vextractf128  $0x1,%ymm4,%xmm6
+  .byte  196,226,89,43,230                   // vpackusdw     %xmm6,%xmm4,%xmm4
+  .byte  196,227,125,25,198,1                // vextractf128  $0x1,%ymm0,%xmm6
+  .byte  196,226,121,43,198                  // vpackusdw     %xmm6,%xmm0,%xmm0
+  .byte  197,241,97,245                      // vpunpcklwd    %xmm5,%xmm1,%xmm6
+  .byte  197,241,105,205                     // vpunpckhwd    %xmm5,%xmm1,%xmm1
+  .byte  197,217,97,232                      // vpunpcklwd    %xmm0,%xmm4,%xmm5
+  .byte  197,217,105,192                     // vpunpckhwd    %xmm0,%xmm4,%xmm0
+  .byte  197,73,98,221                       // vpunpckldq    %xmm5,%xmm6,%xmm11
+  .byte  197,73,106,213                      // vpunpckhdq    %xmm5,%xmm6,%xmm10
+  .byte  197,113,98,200                      // vpunpckldq    %xmm0,%xmm1,%xmm9
+  .byte  197,113,106,192                     // vpunpckhdq    %xmm0,%xmm1,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,31                              // jne           465c <_sk_store_f16_avx+0xd2>
+  .byte  117,70                              // jne           4aa5 <_sk_store_f16_avx+0x23d>
   .byte  196,65,120,17,28,248                // vmovups       %xmm11,(%r8,%rdi,8)
   .byte  196,65,120,17,84,248,16             // vmovups       %xmm10,0x10(%r8,%rdi,8)
   .byte  196,65,120,17,76,248,32             // vmovups       %xmm9,0x20(%r8,%rdi,8)
   .byte  196,65,122,127,68,248,48            // vmovdqu       %xmm8,0x30(%r8,%rdi,8)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,68,36,128                // vmovups       -0x80(%rsp),%ymm0
+  .byte  197,252,16,76,36,160                // vmovups       -0x60(%rsp),%ymm1
+  .byte  197,252,16,100,36,192               // vmovups       -0x40(%rsp),%ymm4
+  .byte  197,252,16,108,36,224               // vmovups       -0x20(%rsp),%ymm5
+  .byte  197,252,16,52,36                    // vmovups       (%rsp),%ymm6
+  .byte  197,252,16,124,36,32                // vmovups       0x20(%rsp),%ymm7
+  .byte  72,131,196,88                       // add           $0x58,%rsp
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,214,28,248               // vmovq         %xmm11,(%r8,%rdi,8)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            4658 <_sk_store_f16_avx+0xce>
+  .byte  116,201                             // je            4a7a <_sk_store_f16_avx+0x212>
   .byte  196,65,121,23,92,248,8              // vmovhpd       %xmm11,0x8(%r8,%rdi,8)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            4658 <_sk_store_f16_avx+0xce>
+  .byte  114,188                             // jb            4a7a <_sk_store_f16_avx+0x212>
   .byte  196,65,121,214,84,248,16            // vmovq         %xmm10,0x10(%r8,%rdi,8)
-  .byte  116,218                             // je            4658 <_sk_store_f16_avx+0xce>
+  .byte  116,179                             // je            4a7a <_sk_store_f16_avx+0x212>
   .byte  196,65,121,23,84,248,24             // vmovhpd       %xmm10,0x18(%r8,%rdi,8)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            4658 <_sk_store_f16_avx+0xce>
+  .byte  114,166                             // jb            4a7a <_sk_store_f16_avx+0x212>
   .byte  196,65,121,214,76,248,32            // vmovq         %xmm9,0x20(%r8,%rdi,8)
-  .byte  116,196                             // je            4658 <_sk_store_f16_avx+0xce>
+  .byte  116,157                             // je            4a7a <_sk_store_f16_avx+0x212>
   .byte  196,65,121,23,76,248,40             // vmovhpd       %xmm9,0x28(%r8,%rdi,8)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,183                             // jb            4658 <_sk_store_f16_avx+0xce>
+  .byte  114,144                             // jb            4a7a <_sk_store_f16_avx+0x212>
   .byte  196,65,121,214,68,248,48            // vmovq         %xmm8,0x30(%r8,%rdi,8)
-  .byte  235,174                             // jmp           4658 <_sk_store_f16_avx+0xce>
+  .byte  235,135                             // jmp           4a7a <_sk_store_f16_avx+0x212>
 
 HIDDEN _sk_load_u16_be_avx
 .globl _sk_load_u16_be_avx
@@ -16452,7 +16660,7 @@
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,5,1,0,0                      // jne           47c5 <_sk_load_u16_be_avx+0x11b>
+  .byte  15,133,5,1,0,0                      // jne           4c0e <_sk_load_u16_be_avx+0x11b>
   .byte  196,65,121,16,4,64                  // vmovupd       (%r8,%rax,2),%xmm8
   .byte  196,193,121,16,84,64,16             // vmovupd       0x10(%r8,%rax,2),%xmm2
   .byte  196,193,121,16,92,64,32             // vmovupd       0x20(%r8,%rax,2),%xmm3
@@ -16511,29 +16719,29 @@
   .byte  196,65,123,16,4,64                  // vmovsd        (%r8,%rax,2),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,85                              // je            482b <_sk_load_u16_be_avx+0x181>
+  .byte  116,85                              // je            4c74 <_sk_load_u16_be_avx+0x181>
   .byte  196,65,57,22,68,64,8                // vmovhpd       0x8(%r8,%rax,2),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,72                              // jb            482b <_sk_load_u16_be_avx+0x181>
+  .byte  114,72                              // jb            4c74 <_sk_load_u16_be_avx+0x181>
   .byte  196,193,123,16,84,64,16             // vmovsd        0x10(%r8,%rax,2),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,72                              // je            4838 <_sk_load_u16_be_avx+0x18e>
+  .byte  116,72                              // je            4c81 <_sk_load_u16_be_avx+0x18e>
   .byte  196,193,105,22,84,64,24             // vmovhpd       0x18(%r8,%rax,2),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,59                              // jb            4838 <_sk_load_u16_be_avx+0x18e>
+  .byte  114,59                              // jb            4c81 <_sk_load_u16_be_avx+0x18e>
   .byte  196,193,123,16,92,64,32             // vmovsd        0x20(%r8,%rax,2),%xmm3
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,205,254,255,255              // je            46db <_sk_load_u16_be_avx+0x31>
+  .byte  15,132,205,254,255,255              // je            4b24 <_sk_load_u16_be_avx+0x31>
   .byte  196,193,97,22,92,64,40              // vmovhpd       0x28(%r8,%rax,2),%xmm3,%xmm3
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,188,254,255,255              // jb            46db <_sk_load_u16_be_avx+0x31>
+  .byte  15,130,188,254,255,255              // jb            4b24 <_sk_load_u16_be_avx+0x31>
   .byte  196,65,122,126,76,64,48             // vmovq         0x30(%r8,%rax,2),%xmm9
-  .byte  233,176,254,255,255                 // jmpq          46db <_sk_load_u16_be_avx+0x31>
+  .byte  233,176,254,255,255                 // jmpq          4b24 <_sk_load_u16_be_avx+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,163,254,255,255                 // jmpq          46db <_sk_load_u16_be_avx+0x31>
+  .byte  233,163,254,255,255                 // jmpq          4b24 <_sk_load_u16_be_avx+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,154,254,255,255                 // jmpq          46db <_sk_load_u16_be_avx+0x31>
+  .byte  233,154,254,255,255                 // jmpq          4b24 <_sk_load_u16_be_avx+0x31>
 
 HIDDEN _sk_load_rgb_u16_be_avx
 .globl _sk_load_rgb_u16_be_avx
@@ -16543,7 +16751,7 @@
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,141,4,127                        // lea           (%rdi,%rdi,2),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,8,1,0,0                      // jne           495b <_sk_load_rgb_u16_be_avx+0x11a>
+  .byte  15,133,8,1,0,0                      // jne           4da4 <_sk_load_rgb_u16_be_avx+0x11a>
   .byte  196,193,122,111,4,64                // vmovdqu       (%r8,%rax,2),%xmm0
   .byte  196,193,122,111,84,64,12            // vmovdqu       0xc(%r8,%rax,2),%xmm2
   .byte  196,193,122,111,76,64,24            // vmovdqu       0x18(%r8,%rax,2),%xmm1
@@ -16602,36 +16810,36 @@
   .byte  196,193,121,110,4,64                // vmovd         (%r8,%rax,2),%xmm0
   .byte  196,193,121,196,68,64,4,2           // vpinsrw       $0x2,0x4(%r8,%rax,2),%xmm0,%xmm0
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,5                               // jne           4974 <_sk_load_rgb_u16_be_avx+0x133>
-  .byte  233,19,255,255,255                  // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  117,5                               // jne           4dbd <_sk_load_rgb_u16_be_avx+0x133>
+  .byte  233,19,255,255,255                  // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
   .byte  196,193,121,110,76,64,6             // vmovd         0x6(%r8,%rax,2),%xmm1
   .byte  196,65,113,196,68,64,10,2           // vpinsrw       $0x2,0xa(%r8,%rax,2),%xmm1,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,26                              // jb            49a3 <_sk_load_rgb_u16_be_avx+0x162>
+  .byte  114,26                              // jb            4dec <_sk_load_rgb_u16_be_avx+0x162>
   .byte  196,193,121,110,76,64,12            // vmovd         0xc(%r8,%rax,2),%xmm1
   .byte  196,193,113,196,84,64,16,2          // vpinsrw       $0x2,0x10(%r8,%rax,2),%xmm1,%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,10                              // jne           49a8 <_sk_load_rgb_u16_be_avx+0x167>
-  .byte  233,228,254,255,255                 // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
-  .byte  233,223,254,255,255                 // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  117,10                              // jne           4df1 <_sk_load_rgb_u16_be_avx+0x167>
+  .byte  233,228,254,255,255                 // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,223,254,255,255                 // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
   .byte  196,193,121,110,76,64,18            // vmovd         0x12(%r8,%rax,2),%xmm1
   .byte  196,65,113,196,76,64,22,2           // vpinsrw       $0x2,0x16(%r8,%rax,2),%xmm1,%xmm9
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,26                              // jb            49d7 <_sk_load_rgb_u16_be_avx+0x196>
+  .byte  114,26                              // jb            4e20 <_sk_load_rgb_u16_be_avx+0x196>
   .byte  196,193,121,110,76,64,24            // vmovd         0x18(%r8,%rax,2),%xmm1
   .byte  196,193,113,196,76,64,28,2          // vpinsrw       $0x2,0x1c(%r8,%rax,2),%xmm1,%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,10                              // jne           49dc <_sk_load_rgb_u16_be_avx+0x19b>
-  .byte  233,176,254,255,255                 // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
-  .byte  233,171,254,255,255                 // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  117,10                              // jne           4e25 <_sk_load_rgb_u16_be_avx+0x19b>
+  .byte  233,176,254,255,255                 // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,171,254,255,255                 // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
   .byte  196,193,121,110,92,64,30            // vmovd         0x1e(%r8,%rax,2),%xmm3
   .byte  196,65,97,196,92,64,34,2            // vpinsrw       $0x2,0x22(%r8,%rax,2),%xmm3,%xmm11
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,20                              // jb            4a05 <_sk_load_rgb_u16_be_avx+0x1c4>
+  .byte  114,20                              // jb            4e4e <_sk_load_rgb_u16_be_avx+0x1c4>
   .byte  196,193,121,110,92,64,36            // vmovd         0x24(%r8,%rax,2),%xmm3
   .byte  196,193,97,196,92,64,40,2           // vpinsrw       $0x2,0x28(%r8,%rax,2),%xmm3,%xmm3
-  .byte  233,130,254,255,255                 // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
-  .byte  233,125,254,255,255                 // jmpq          4887 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,130,254,255,255                 // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,125,254,255,255                 // jmpq          4cd0 <_sk_load_rgb_u16_be_avx+0x46>
 
 HIDDEN _sk_store_u16_be_avx
 .globl _sk_store_u16_be_avx
@@ -16681,7 +16889,7 @@
   .byte  196,65,17,98,200                    // vpunpckldq    %xmm8,%xmm13,%xmm9
   .byte  196,65,17,106,192                   // vpunpckhdq    %xmm8,%xmm13,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,31                              // jne           4b0c <_sk_store_u16_be_avx+0x102>
+  .byte  117,31                              // jne           4f55 <_sk_store_u16_be_avx+0x102>
   .byte  196,1,120,17,28,72                  // vmovups       %xmm11,(%r8,%r9,2)
   .byte  196,1,120,17,84,72,16               // vmovups       %xmm10,0x10(%r8,%r9,2)
   .byte  196,1,120,17,76,72,32               // vmovups       %xmm9,0x20(%r8,%r9,2)
@@ -16690,22 +16898,22 @@
   .byte  255,224                             // jmpq          *%rax
   .byte  196,1,121,214,28,72                 // vmovq         %xmm11,(%r8,%r9,2)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  116,240                             // je            4f51 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,23,92,72,8                // vmovhpd       %xmm11,0x8(%r8,%r9,2)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  114,227                             // jb            4f51 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,214,84,72,16              // vmovq         %xmm10,0x10(%r8,%r9,2)
-  .byte  116,218                             // je            4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  116,218                             // je            4f51 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,23,84,72,24               // vmovhpd       %xmm10,0x18(%r8,%r9,2)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  114,205                             // jb            4f51 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,214,76,72,32              // vmovq         %xmm9,0x20(%r8,%r9,2)
-  .byte  116,196                             // je            4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  116,196                             // je            4f51 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,23,76,72,40               // vmovhpd       %xmm9,0x28(%r8,%r9,2)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,183                             // jb            4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  114,183                             // jb            4f51 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,214,68,72,48              // vmovq         %xmm8,0x30(%r8,%r9,2)
-  .byte  235,174                             // jmp           4b08 <_sk_store_u16_be_avx+0xfe>
+  .byte  235,174                             // jmp           4f51 <_sk_store_u16_be_avx+0xfe>
 
 HIDDEN _sk_load_f32_avx
 .globl _sk_load_f32_avx
@@ -16713,10 +16921,10 @@
 _sk_load_f32_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  119,110                             // ja            4bd0 <_sk_load_f32_avx+0x76>
+  .byte  119,110                             // ja            5019 <_sk_load_f32_avx+0x76>
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
-  .byte  76,141,21,132,0,0,0                 // lea           0x84(%rip),%r10        # 4bf8 <_sk_load_f32_avx+0x9e>
+  .byte  76,141,21,135,0,0,0                 // lea           0x87(%rip),%r10        # 5044 <_sk_load_f32_avx+0xa1>
   .byte  73,99,4,138                         // movslq        (%r10,%rcx,4),%rax
   .byte  76,1,208                            // add           %r10,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16742,19 +16950,21 @@
   .byte  196,193,101,21,216                  // vunpckhpd     %ymm8,%ymm3,%ymm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
-  .byte  133,255                             // test          %edi,%edi
+  .byte  15,31,0                             // nopl          (%rax)
+  .byte  130                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  255,204                             // dec           %esp
+  .byte  255                                 // (bad)
+  .byte  255,201                             // dec           %ecx
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  191,255,255,255,178                 // mov           $0xb2ffffff,%edi
+  .byte  188,255,255,255,175                 // mov           $0xafffffff,%esp
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  255,165,255,255,255,157             // jmpq          *-0x62000001(%rbp)
+  .byte  255,162,255,255,255,154             // jmpq          *-0x65000001(%rdx)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  255,149,255,255,255,141             // callq         *-0x72000001(%rbp)
+  .byte  255,146,255,255,255,138             // callq         *-0x75000001(%rdx)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // .byte         0xff
@@ -16775,7 +16985,7 @@
   .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
   .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           4c85 <_sk_store_f32_avx+0x6d>
+  .byte  117,55                              // jne           50d1 <_sk_store_f32_avx+0x6d>
   .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -16788,22 +16998,22 @@
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            4c81 <_sk_store_f32_avx+0x69>
+  .byte  116,240                             // je            50cd <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            4c81 <_sk_store_f32_avx+0x69>
+  .byte  114,227                             // jb            50cd <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            4c81 <_sk_store_f32_avx+0x69>
+  .byte  116,218                             // je            50cd <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            4c81 <_sk_store_f32_avx+0x69>
+  .byte  114,205                             // jb            50cd <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            4c81 <_sk_store_f32_avx+0x69>
+  .byte  116,195                             // je            50cd <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            4c81 <_sk_store_f32_avx+0x69>
+  .byte  114,181                             // jb            50cd <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           4c81 <_sk_store_f32_avx+0x69>
+  .byte  235,171                             // jmp           50cd <_sk_store_f32_avx+0x69>
 
 HIDDEN _sk_clamp_x_avx
 .globl _sk_clamp_x_avx
@@ -17131,7 +17341,7 @@
   .byte  196,226,125,24,88,28                // vbroadcastss  0x1c(%rax),%ymm3
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  15,132,146,0,0,0                    // je            5239 <_sk_linear_gradient_avx+0xb8>
+  .byte  15,132,146,0,0,0                    // je            5685 <_sk_linear_gradient_avx+0xb8>
   .byte  72,139,64,8                         // mov           0x8(%rax),%rax
   .byte  72,131,192,32                       // add           $0x20,%rax
   .byte  196,65,28,87,228                    // vxorps        %ymm12,%ymm12,%ymm12
@@ -17158,8 +17368,8 @@
   .byte  196,227,13,74,219,208               // vblendvps     %ymm13,%ymm3,%ymm14,%ymm3
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  73,255,200                          // dec           %r8
-  .byte  117,140                             // jne           51c3 <_sk_linear_gradient_avx+0x42>
-  .byte  235,20                              // jmp           524d <_sk_linear_gradient_avx+0xcc>
+  .byte  117,140                             // jne           560f <_sk_linear_gradient_avx+0x42>
+  .byte  235,20                              // jmp           5699 <_sk_linear_gradient_avx+0xcc>
   .byte  196,65,36,87,219                    // vxorps        %ymm11,%ymm11,%ymm11
   .byte  196,65,44,87,210                    // vxorps        %ymm10,%ymm10,%ymm10
   .byte  196,65,52,87,201                    // vxorps        %ymm9,%ymm9,%ymm9
@@ -20811,43 +21021,70 @@
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  243,15,111,4,248                    // movdqu        (%rax,%rdi,8),%xmm0
   .byte  243,15,111,76,248,16                // movdqu        0x10(%rax,%rdi,8),%xmm1
-  .byte  102,68,15,111,192                   // movdqa        %xmm0,%xmm8
-  .byte  102,68,15,97,193                    // punpcklwd     %xmm1,%xmm8
+  .byte  102,68,15,111,200                   // movdqa        %xmm0,%xmm9
+  .byte  102,68,15,97,201                    // punpcklwd     %xmm1,%xmm9
   .byte  102,15,105,193                      // punpckhwd     %xmm1,%xmm0
-  .byte  102,65,15,111,200                   // movdqa        %xmm8,%xmm1
-  .byte  102,15,97,200                       // punpcklwd     %xmm0,%xmm1
-  .byte  102,68,15,105,192                   // punpckhwd     %xmm0,%xmm8
-  .byte  184,0,4,0,4                         // mov           $0x4000400,%eax
+  .byte  102,69,15,111,225                   // movdqa        %xmm9,%xmm12
+  .byte  102,68,15,97,224                    // punpcklwd     %xmm0,%xmm12
+  .byte  102,68,15,105,200                   // punpckhwd     %xmm0,%xmm9
+  .byte  102,69,15,56,51,236                 // pmovzxwd      %xmm12,%xmm13
+  .byte  184,0,128,0,0                       // mov           $0x8000,%eax
+  .byte  102,15,110,192                      // movd          %eax,%xmm0
+  .byte  102,68,15,112,192,0                 // pshufd        $0x0,%xmm0,%xmm8
+  .byte  102,65,15,111,213                   // movdqa        %xmm13,%xmm2
+  .byte  102,65,15,219,208                   // pand          %xmm8,%xmm2
+  .byte  184,0,124,0,0                       // mov           $0x7c00,%eax
   .byte  102,15,110,192                      // movd          %eax,%xmm0
   .byte  102,15,112,216,0                    // pshufd        $0x0,%xmm0,%xmm3
-  .byte  102,15,111,195                      // movdqa        %xmm3,%xmm0
-  .byte  102,15,101,193                      // pcmpgtw       %xmm1,%xmm0
-  .byte  102,15,223,193                      // pandn         %xmm1,%xmm0
-  .byte  102,15,56,51,192                    // pmovzxwd      %xmm0,%xmm0
-  .byte  102,15,114,240,13                   // pslld         $0xd,%xmm0
-  .byte  184,0,0,128,119                     // mov           $0x77800000,%eax
-  .byte  102,15,110,208                      // movd          %eax,%xmm2
-  .byte  102,68,15,112,202,0                 // pshufd        $0x0,%xmm2,%xmm9
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  102,15,112,201,78                   // pshufd        $0x4e,%xmm1,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,15,101,209                      // pcmpgtw       %xmm1,%xmm2
-  .byte  102,15,223,209                      // pandn         %xmm1,%xmm2
-  .byte  102,15,56,51,202                    // pmovzxwd      %xmm2,%xmm1
-  .byte  102,15,114,241,13                   // pslld         $0xd,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,65,15,101,208                   // pcmpgtw       %xmm8,%xmm2
-  .byte  102,65,15,223,208                   // pandn         %xmm8,%xmm2
-  .byte  102,15,56,51,210                    // pmovzxwd      %xmm2,%xmm2
-  .byte  102,15,114,242,13                   // pslld         $0xd,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  102,69,15,112,192,78                // pshufd        $0x4e,%xmm8,%xmm8
-  .byte  102,65,15,101,216                   // pcmpgtw       %xmm8,%xmm3
+  .byte  102,65,15,111,197                   // movdqa        %xmm13,%xmm0
+  .byte  102,15,219,195                      // pand          %xmm3,%xmm0
+  .byte  102,68,15,239,234                   // pxor          %xmm2,%xmm13
+  .byte  102,69,15,239,210                   // pxor          %xmm10,%xmm10
+  .byte  102,15,114,242,16                   // pslld         $0x10,%xmm2
+  .byte  102,65,15,114,245,13                // pslld         $0xd,%xmm13
+  .byte  184,0,0,0,56                        // mov           $0x38000000,%eax
+  .byte  102,15,110,200                      // movd          %eax,%xmm1
+  .byte  102,68,15,112,217,0                 // pshufd        $0x0,%xmm1,%xmm11
+  .byte  102,65,15,254,211                   // paddd         %xmm11,%xmm2
+  .byte  102,65,15,254,213                   // paddd         %xmm13,%xmm2
+  .byte  102,65,15,118,194                   // pcmpeqd       %xmm10,%xmm0
+  .byte  102,15,223,194                      // pandn         %xmm2,%xmm0
+  .byte  102,65,15,115,220,8                 // psrldq        $0x8,%xmm12
+  .byte  102,69,15,56,51,228                 // pmovzxwd      %xmm12,%xmm12
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,65,15,219,208                   // pand          %xmm8,%xmm2
+  .byte  102,65,15,111,204                   // movdqa        %xmm12,%xmm1
+  .byte  102,15,219,203                      // pand          %xmm3,%xmm1
+  .byte  102,68,15,239,226                   // pxor          %xmm2,%xmm12
+  .byte  102,15,114,242,16                   // pslld         $0x10,%xmm2
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,65,15,254,211                   // paddd         %xmm11,%xmm2
+  .byte  102,65,15,254,212                   // paddd         %xmm12,%xmm2
+  .byte  102,65,15,118,202                   // pcmpeqd       %xmm10,%xmm1
+  .byte  102,15,223,202                      // pandn         %xmm2,%xmm1
+  .byte  102,69,15,56,51,225                 // pmovzxwd      %xmm9,%xmm12
+  .byte  102,69,15,111,236                   // movdqa        %xmm12,%xmm13
+  .byte  102,69,15,219,232                   // pand          %xmm8,%xmm13
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,15,219,211                      // pand          %xmm3,%xmm2
+  .byte  102,69,15,239,229                   // pxor          %xmm13,%xmm12
+  .byte  102,65,15,114,245,16                // pslld         $0x10,%xmm13
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,69,15,254,235                   // paddd         %xmm11,%xmm13
+  .byte  102,69,15,254,236                   // paddd         %xmm12,%xmm13
+  .byte  102,65,15,118,210                   // pcmpeqd       %xmm10,%xmm2
+  .byte  102,65,15,223,213                   // pandn         %xmm13,%xmm2
+  .byte  102,65,15,115,217,8                 // psrldq        $0x8,%xmm9
+  .byte  102,69,15,56,51,201                 // pmovzxwd      %xmm9,%xmm9
+  .byte  102,69,15,219,193                   // pand          %xmm9,%xmm8
+  .byte  102,65,15,219,217                   // pand          %xmm9,%xmm3
+  .byte  102,69,15,239,200                   // pxor          %xmm8,%xmm9
+  .byte  102,65,15,114,240,16                // pslld         $0x10,%xmm8
+  .byte  102,65,15,114,241,13                // pslld         $0xd,%xmm9
+  .byte  102,69,15,254,195                   // paddd         %xmm11,%xmm8
+  .byte  102,69,15,254,193                   // paddd         %xmm9,%xmm8
+  .byte  102,65,15,118,218                   // pcmpeqd       %xmm10,%xmm3
   .byte  102,65,15,223,216                   // pandn         %xmm8,%xmm3
-  .byte  102,15,56,51,219                    // pmovzxwd      %xmm3,%xmm3
-  .byte  102,15,114,243,13                   // pslld         $0xd,%xmm3
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
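The bytes above are the new non-hardware half-to-float path: mask off the half's sign (0x8000) and exponent (0x7c00), shift the sign and the exponent+mantissa into float position, rebias the exponent by (127-15)<<23 == 0x38000000, and flush anything whose half exponent is zero (zero and denorms) to zero via pcmpeqd/pandn. A scalar C++ sketch of that bit logic, reconstructed from the disassembly (the name from_half_bits and the scalar framing are illustrative, not the stage's actual source; note inf/NaN inputs are not special-cased by this path):

    #include <cstdint>
    #include <cstring>

    static float from_half_bits(uint16_t h) {
        uint32_t sem = h,                  // zero-extended, like pmovzxwd
                 s   = sem & 0x8000,       // sign bit
                 e   = sem & 0x7c00,       // exponent bits
                 em  = sem ^ s;            // exponent + mantissa, sign cleared

        // Sign up to bit 31, exponent+mantissa shifted into float position,
        // exponent rebiased from 15 to 127: (127-15)<<23 == 0x38000000.
        uint32_t bits = (s << 16) + (em << 13) + 0x38000000;

        if (e == 0) {                      // pcmpeqd/pandn: zero and denorms -> 0
            bits = 0;
        }

        float f;
        std::memcpy(&f, &bits, sizeof f);
        return f;
    }
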
@@ -20875,43 +21112,70 @@
   .byte  243,65,15,126,4,193                 // movq          (%r9,%rax,8),%xmm0
   .byte  243,67,15,126,20,193                // movq          (%r9,%r8,8),%xmm2
   .byte  102,15,108,208                      // punpcklqdq    %xmm0,%xmm2
-  .byte  102,68,15,111,194                   // movdqa        %xmm2,%xmm8
-  .byte  102,68,15,97,193                    // punpcklwd     %xmm1,%xmm8
+  .byte  102,68,15,111,202                   // movdqa        %xmm2,%xmm9
+  .byte  102,68,15,97,201                    // punpcklwd     %xmm1,%xmm9
   .byte  102,15,105,209                      // punpckhwd     %xmm1,%xmm2
-  .byte  102,65,15,111,200                   // movdqa        %xmm8,%xmm1
-  .byte  102,15,97,202                       // punpcklwd     %xmm2,%xmm1
-  .byte  102,68,15,105,194                   // punpckhwd     %xmm2,%xmm8
-  .byte  184,0,4,0,4                         // mov           $0x4000400,%eax
+  .byte  102,69,15,111,225                   // movdqa        %xmm9,%xmm12
+  .byte  102,68,15,97,226                    // punpcklwd     %xmm2,%xmm12
+  .byte  102,68,15,105,202                   // punpckhwd     %xmm2,%xmm9
+  .byte  102,69,15,56,51,236                 // pmovzxwd      %xmm12,%xmm13
+  .byte  184,0,128,0,0                       // mov           $0x8000,%eax
+  .byte  102,15,110,192                      // movd          %eax,%xmm0
+  .byte  102,68,15,112,192,0                 // pshufd        $0x0,%xmm0,%xmm8
+  .byte  102,65,15,111,213                   // movdqa        %xmm13,%xmm2
+  .byte  102,65,15,219,208                   // pand          %xmm8,%xmm2
+  .byte  184,0,124,0,0                       // mov           $0x7c00,%eax
   .byte  102,15,110,192                      // movd          %eax,%xmm0
   .byte  102,15,112,216,0                    // pshufd        $0x0,%xmm0,%xmm3
-  .byte  102,15,111,195                      // movdqa        %xmm3,%xmm0
-  .byte  102,15,101,193                      // pcmpgtw       %xmm1,%xmm0
-  .byte  102,15,223,193                      // pandn         %xmm1,%xmm0
-  .byte  102,15,56,51,192                    // pmovzxwd      %xmm0,%xmm0
-  .byte  102,15,114,240,13                   // pslld         $0xd,%xmm0
-  .byte  184,0,0,128,119                     // mov           $0x77800000,%eax
-  .byte  102,15,110,208                      // movd          %eax,%xmm2
-  .byte  102,68,15,112,202,0                 // pshufd        $0x0,%xmm2,%xmm9
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  102,15,112,201,78                   // pshufd        $0x4e,%xmm1,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,15,101,209                      // pcmpgtw       %xmm1,%xmm2
-  .byte  102,15,223,209                      // pandn         %xmm1,%xmm2
-  .byte  102,15,56,51,202                    // pmovzxwd      %xmm2,%xmm1
-  .byte  102,15,114,241,13                   // pslld         $0xd,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,65,15,101,208                   // pcmpgtw       %xmm8,%xmm2
-  .byte  102,65,15,223,208                   // pandn         %xmm8,%xmm2
-  .byte  102,15,56,51,210                    // pmovzxwd      %xmm2,%xmm2
-  .byte  102,15,114,242,13                   // pslld         $0xd,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  102,69,15,112,192,78                // pshufd        $0x4e,%xmm8,%xmm8
-  .byte  102,65,15,101,216                   // pcmpgtw       %xmm8,%xmm3
+  .byte  102,65,15,111,197                   // movdqa        %xmm13,%xmm0
+  .byte  102,15,219,195                      // pand          %xmm3,%xmm0
+  .byte  102,68,15,239,234                   // pxor          %xmm2,%xmm13
+  .byte  102,69,15,239,210                   // pxor          %xmm10,%xmm10
+  .byte  102,15,114,242,16                   // pslld         $0x10,%xmm2
+  .byte  102,65,15,114,245,13                // pslld         $0xd,%xmm13
+  .byte  184,0,0,0,56                        // mov           $0x38000000,%eax
+  .byte  102,15,110,200                      // movd          %eax,%xmm1
+  .byte  102,68,15,112,217,0                 // pshufd        $0x0,%xmm1,%xmm11
+  .byte  102,65,15,254,211                   // paddd         %xmm11,%xmm2
+  .byte  102,65,15,254,213                   // paddd         %xmm13,%xmm2
+  .byte  102,65,15,118,194                   // pcmpeqd       %xmm10,%xmm0
+  .byte  102,15,223,194                      // pandn         %xmm2,%xmm0
+  .byte  102,65,15,115,220,8                 // psrldq        $0x8,%xmm12
+  .byte  102,69,15,56,51,228                 // pmovzxwd      %xmm12,%xmm12
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,65,15,219,208                   // pand          %xmm8,%xmm2
+  .byte  102,65,15,111,204                   // movdqa        %xmm12,%xmm1
+  .byte  102,15,219,203                      // pand          %xmm3,%xmm1
+  .byte  102,68,15,239,226                   // pxor          %xmm2,%xmm12
+  .byte  102,15,114,242,16                   // pslld         $0x10,%xmm2
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,65,15,254,211                   // paddd         %xmm11,%xmm2
+  .byte  102,65,15,254,212                   // paddd         %xmm12,%xmm2
+  .byte  102,65,15,118,202                   // pcmpeqd       %xmm10,%xmm1
+  .byte  102,15,223,202                      // pandn         %xmm2,%xmm1
+  .byte  102,69,15,56,51,225                 // pmovzxwd      %xmm9,%xmm12
+  .byte  102,69,15,111,236                   // movdqa        %xmm12,%xmm13
+  .byte  102,69,15,219,232                   // pand          %xmm8,%xmm13
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,15,219,211                      // pand          %xmm3,%xmm2
+  .byte  102,69,15,239,229                   // pxor          %xmm13,%xmm12
+  .byte  102,65,15,114,245,16                // pslld         $0x10,%xmm13
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,69,15,254,235                   // paddd         %xmm11,%xmm13
+  .byte  102,69,15,254,236                   // paddd         %xmm12,%xmm13
+  .byte  102,65,15,118,210                   // pcmpeqd       %xmm10,%xmm2
+  .byte  102,65,15,223,213                   // pandn         %xmm13,%xmm2
+  .byte  102,65,15,115,217,8                 // psrldq        $0x8,%xmm9
+  .byte  102,69,15,56,51,201                 // pmovzxwd      %xmm9,%xmm9
+  .byte  102,69,15,219,193                   // pand          %xmm9,%xmm8
+  .byte  102,65,15,219,217                   // pand          %xmm9,%xmm3
+  .byte  102,69,15,239,200                   // pxor          %xmm8,%xmm9
+  .byte  102,65,15,114,240,16                // pslld         $0x10,%xmm8
+  .byte  102,65,15,114,241,13                // pslld         $0xd,%xmm9
+  .byte  102,69,15,254,195                   // paddd         %xmm11,%xmm8
+  .byte  102,69,15,254,193                   // paddd         %xmm9,%xmm8
+  .byte  102,65,15,118,218                   // pcmpeqd       %xmm10,%xmm3
   .byte  102,65,15,223,216                   // pandn         %xmm8,%xmm3
-  .byte  102,15,56,51,219                    // pmovzxwd      %xmm3,%xmm3
-  .byte  102,15,114,243,13                   // pslld         $0xd,%xmm3
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
@@ -20921,30 +21185,68 @@
 _sk_store_f16_sse41:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  185,0,0,128,7                       // mov           $0x7800000,%ecx
+  .byte  185,0,0,0,128                       // mov           $0x80000000,%ecx
   .byte  102,68,15,110,193                   // movd          %ecx,%xmm8
   .byte  102,69,15,112,200,0                 // pshufd        $0x0,%xmm8,%xmm9
-  .byte  102,69,15,111,193                   // movdqa        %xmm9,%xmm8
-  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
-  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
+  .byte  102,69,15,111,225                   // movdqa        %xmm9,%xmm12
+  .byte  102,68,15,219,224                   // pand          %xmm0,%xmm12
+  .byte  102,68,15,111,192                   // movdqa        %xmm0,%xmm8
+  .byte  102,69,15,239,196                   // pxor          %xmm12,%xmm8
+  .byte  185,0,0,128,56                      // mov           $0x38800000,%ecx
+  .byte  102,68,15,110,209                   // movd          %ecx,%xmm10
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
+  .byte  102,65,15,114,212,16                // psrld         $0x10,%xmm12
+  .byte  102,69,15,111,232                   // movdqa        %xmm8,%xmm13
+  .byte  102,65,15,114,213,13                // psrld         $0xd,%xmm13
+  .byte  185,0,192,1,0                       // mov           $0x1c000,%ecx
+  .byte  102,68,15,110,217                   // movd          %ecx,%xmm11
+  .byte  102,69,15,112,219,0                 // pshufd        $0x0,%xmm11,%xmm11
+  .byte  102,69,15,250,227                   // psubd         %xmm11,%xmm12
+  .byte  102,69,15,254,229                   // paddd         %xmm13,%xmm12
+  .byte  69,15,194,194,5                     // cmpnltps      %xmm10,%xmm8
+  .byte  69,15,84,196                        // andps         %xmm12,%xmm8
   .byte  102,69,15,56,43,192                 // packusdw      %xmm8,%xmm8
-  .byte  102,69,15,111,209                   // movdqa        %xmm9,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  102,65,15,114,210,13                // psrld         $0xd,%xmm10
-  .byte  102,69,15,56,43,210                 // packusdw      %xmm10,%xmm10
-  .byte  102,69,15,111,217                   // movdqa        %xmm9,%xmm11
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  102,65,15,114,211,13                // psrld         $0xd,%xmm11
-  .byte  102,69,15,56,43,219                 // packusdw      %xmm11,%xmm11
-  .byte  68,15,89,203                        // mulps         %xmm3,%xmm9
-  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
-  .byte  102,69,15,56,43,201                 // packusdw      %xmm9,%xmm9
-  .byte  102,69,15,97,194                    // punpcklwd     %xmm10,%xmm8
-  .byte  102,69,15,97,217                    // punpcklwd     %xmm9,%xmm11
+  .byte  102,69,15,111,233                   // movdqa        %xmm9,%xmm13
+  .byte  102,68,15,219,233                   // pand          %xmm1,%xmm13
+  .byte  102,68,15,111,225                   // movdqa        %xmm1,%xmm12
+  .byte  102,69,15,239,229                   // pxor          %xmm13,%xmm12
+  .byte  102,65,15,114,213,16                // psrld         $0x10,%xmm13
+  .byte  102,69,15,111,244                   // movdqa        %xmm12,%xmm14
+  .byte  102,65,15,114,214,13                // psrld         $0xd,%xmm14
+  .byte  102,69,15,250,235                   // psubd         %xmm11,%xmm13
+  .byte  102,69,15,254,238                   // paddd         %xmm14,%xmm13
+  .byte  69,15,194,226,5                     // cmpnltps      %xmm10,%xmm12
+  .byte  69,15,84,229                        // andps         %xmm13,%xmm12
+  .byte  102,69,15,56,43,228                 // packusdw      %xmm12,%xmm12
+  .byte  102,69,15,111,241                   // movdqa        %xmm9,%xmm14
+  .byte  102,68,15,219,242                   // pand          %xmm2,%xmm14
+  .byte  102,68,15,111,234                   // movdqa        %xmm2,%xmm13
+  .byte  102,69,15,239,238                   // pxor          %xmm14,%xmm13
+  .byte  102,65,15,114,214,16                // psrld         $0x10,%xmm14
+  .byte  102,69,15,111,253                   // movdqa        %xmm13,%xmm15
+  .byte  102,65,15,114,215,13                // psrld         $0xd,%xmm15
+  .byte  102,69,15,250,243                   // psubd         %xmm11,%xmm14
+  .byte  102,69,15,254,247                   // paddd         %xmm15,%xmm14
+  .byte  69,15,194,234,5                     // cmpnltps      %xmm10,%xmm13
+  .byte  69,15,84,238                        // andps         %xmm14,%xmm13
+  .byte  102,69,15,56,43,237                 // packusdw      %xmm13,%xmm13
+  .byte  102,68,15,219,203                   // pand          %xmm3,%xmm9
+  .byte  102,68,15,111,243                   // movdqa        %xmm3,%xmm14
+  .byte  102,69,15,239,241                   // pxor          %xmm9,%xmm14
+  .byte  102,65,15,114,209,16                // psrld         $0x10,%xmm9
+  .byte  102,69,15,111,254                   // movdqa        %xmm14,%xmm15
+  .byte  102,65,15,114,215,13                // psrld         $0xd,%xmm15
+  .byte  102,69,15,250,203                   // psubd         %xmm11,%xmm9
+  .byte  102,69,15,254,207                   // paddd         %xmm15,%xmm9
+  .byte  69,15,194,242,5                     // cmpnltps      %xmm10,%xmm14
+  .byte  69,15,84,241                        // andps         %xmm9,%xmm14
+  .byte  102,69,15,56,43,246                 // packusdw      %xmm14,%xmm14
+  .byte  102,69,15,97,196                    // punpcklwd     %xmm12,%xmm8
+  .byte  102,69,15,97,238                    // punpcklwd     %xmm14,%xmm13
   .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  102,69,15,98,203                    // punpckldq     %xmm11,%xmm9
+  .byte  102,69,15,98,205                    // punpckldq     %xmm13,%xmm9
   .byte  243,68,15,127,12,248                // movdqu        %xmm9,(%rax,%rdi,8)
-  .byte  102,69,15,106,195                   // punpckhdq     %xmm11,%xmm8
+  .byte  102,69,15,106,197                   // punpckhdq     %xmm13,%xmm8
   .byte  243,68,15,127,68,248,16             // movdqu        %xmm8,0x10(%rax,%rdi,8)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
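
The store path above is the mirror image: isolate the float sign (0x80000000), shift the magnitude bits right by 13, subtract the pre-shifted rebias 0x1c000 (== 0x38000000 >> 13), and use cmpnltps against 0x38800000 (2^-14, the smallest normal half) to flush would-be half denorms to zero before packing down to 16 bits. A matching scalar sketch, again reconstructed from the disassembly (to_half_bits is an illustrative name):

    #include <cstdint>
    #include <cstring>

    static uint16_t to_half_bits(float f) {
        uint32_t sem;
        std::memcpy(&sem, &f, sizeof sem);
        uint32_t s  = sem & 0x80000000,    // sign bit
                 em = sem ^ s;             // |f| bits

        // Sign down to bit 15, magnitude into half position, exponent
        // rebiased from 127 to 15: 0x1c000 == ((127-15) << 23) >> 13.
        uint32_t h = (s >> 16) + (em >> 13) - 0x1c000;

        // cmpnltps keeps only values >= 2^-14 (0x38800000 as float bits);
        // comparing bit patterns is equivalent for non-negative floats.
        if (em < 0x38800000) {
            h = 0;
        }
        return (uint16_t)h;
    }
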
@@ -21523,7 +21825,7 @@
   .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
   .byte  72,139,8                            // mov           (%rax),%rcx
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,132,254,0,0,0                    // je            38c9 <_sk_linear_gradient_sse41+0x138>
+  .byte  15,132,254,0,0,0                    // je            3abe <_sk_linear_gradient_sse41+0x138>
   .byte  15,41,100,36,168                    // movaps        %xmm4,-0x58(%rsp)
   .byte  15,41,108,36,184                    // movaps        %xmm5,-0x48(%rsp)
   .byte  15,41,116,36,200                    // movaps        %xmm6,-0x38(%rsp)
@@ -21573,12 +21875,12 @@
   .byte  15,40,196                           // movaps        %xmm4,%xmm0
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  72,255,201                          // dec           %rcx
-  .byte  15,133,65,255,255,255               // jne           37f4 <_sk_linear_gradient_sse41+0x63>
+  .byte  15,133,65,255,255,255               // jne           39e9 <_sk_linear_gradient_sse41+0x63>
   .byte  15,40,124,36,216                    // movaps        -0x28(%rsp),%xmm7
   .byte  15,40,116,36,200                    // movaps        -0x38(%rsp),%xmm6
   .byte  15,40,108,36,184                    // movaps        -0x48(%rsp),%xmm5
   .byte  15,40,100,36,168                    // movaps        -0x58(%rsp),%xmm4
-  .byte  235,13                              // jmp           38d6 <_sk_linear_gradient_sse41+0x145>
+  .byte  235,13                              // jmp           3acb <_sk_linear_gradient_sse41+0x145>
   .byte  15,87,201                           // xorps         %xmm1,%xmm1
   .byte  15,87,210                           // xorps         %xmm2,%xmm2
   .byte  15,87,219                           // xorps         %xmm3,%xmm3
@@ -25440,41 +25742,69 @@
   .byte  102,68,15,111,192                   // movdqa        %xmm0,%xmm8
   .byte  102,68,15,97,193                    // punpcklwd     %xmm1,%xmm8
   .byte  102,15,105,193                      // punpckhwd     %xmm1,%xmm0
-  .byte  102,65,15,111,200                   // movdqa        %xmm8,%xmm1
-  .byte  102,15,97,200                       // punpcklwd     %xmm0,%xmm1
+  .byte  102,69,15,111,224                   // movdqa        %xmm8,%xmm12
+  .byte  102,68,15,97,224                    // punpcklwd     %xmm0,%xmm12
   .byte  102,68,15,105,192                   // punpckhwd     %xmm0,%xmm8
-  .byte  184,0,4,0,4                         // mov           $0x4000400,%eax
+  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
+  .byte  102,69,15,111,236                   // movdqa        %xmm12,%xmm13
+  .byte  102,69,15,97,233                    // punpcklwd     %xmm9,%xmm13
+  .byte  184,0,128,0,0                       // mov           $0x8000,%eax
+  .byte  102,15,110,192                      // movd          %eax,%xmm0
+  .byte  102,68,15,112,208,0                 // pshufd        $0x0,%xmm0,%xmm10
+  .byte  102,65,15,111,205                   // movdqa        %xmm13,%xmm1
+  .byte  102,65,15,219,202                   // pand          %xmm10,%xmm1
+  .byte  184,0,124,0,0                       // mov           $0x7c00,%eax
   .byte  102,15,110,192                      // movd          %eax,%xmm0
   .byte  102,15,112,216,0                    // pshufd        $0x0,%xmm0,%xmm3
-  .byte  102,15,111,195                      // movdqa        %xmm3,%xmm0
-  .byte  102,15,101,193                      // pcmpgtw       %xmm1,%xmm0
-  .byte  102,15,223,193                      // pandn         %xmm1,%xmm0
-  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
-  .byte  102,65,15,97,193                    // punpcklwd     %xmm9,%xmm0
-  .byte  102,15,114,240,13                   // pslld         $0xd,%xmm0
-  .byte  184,0,0,128,119                     // mov           $0x77800000,%eax
+  .byte  102,65,15,111,197                   // movdqa        %xmm13,%xmm0
+  .byte  102,15,219,195                      // pand          %xmm3,%xmm0
+  .byte  102,68,15,239,233                   // pxor          %xmm1,%xmm13
+  .byte  102,15,114,241,16                   // pslld         $0x10,%xmm1
+  .byte  102,65,15,114,245,13                // pslld         $0xd,%xmm13
+  .byte  184,0,0,0,56                        // mov           $0x38000000,%eax
   .byte  102,15,110,208                      // movd          %eax,%xmm2
-  .byte  102,68,15,112,210,0                 // pshufd        $0x0,%xmm2,%xmm10
-  .byte  65,15,89,194                        // mulps         %xmm10,%xmm0
-  .byte  102,15,112,209,78                   // pshufd        $0x4e,%xmm1,%xmm2
-  .byte  102,15,111,203                      // movdqa        %xmm3,%xmm1
-  .byte  102,15,101,202                      // pcmpgtw       %xmm2,%xmm1
+  .byte  102,68,15,112,218,0                 // pshufd        $0x0,%xmm2,%xmm11
+  .byte  102,65,15,254,203                   // paddd         %xmm11,%xmm1
+  .byte  102,65,15,254,205                   // paddd         %xmm13,%xmm1
+  .byte  102,65,15,118,193                   // pcmpeqd       %xmm9,%xmm0
+  .byte  102,15,223,193                      // pandn         %xmm1,%xmm0
+  .byte  102,65,15,115,220,8                 // psrldq        $0x8,%xmm12
+  .byte  102,69,15,97,225                    // punpcklwd     %xmm9,%xmm12
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,65,15,219,210                   // pand          %xmm10,%xmm2
+  .byte  102,65,15,111,204                   // movdqa        %xmm12,%xmm1
+  .byte  102,15,219,203                      // pand          %xmm3,%xmm1
+  .byte  102,68,15,239,226                   // pxor          %xmm2,%xmm12
+  .byte  102,15,114,242,16                   // pslld         $0x10,%xmm2
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,65,15,254,211                   // paddd         %xmm11,%xmm2
+  .byte  102,65,15,254,212                   // paddd         %xmm12,%xmm2
+  .byte  102,65,15,118,201                   // pcmpeqd       %xmm9,%xmm1
   .byte  102,15,223,202                      // pandn         %xmm2,%xmm1
-  .byte  102,65,15,97,201                    // punpcklwd     %xmm9,%xmm1
-  .byte  102,15,114,241,13                   // pslld         $0xd,%xmm1
-  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,65,15,101,208                   // pcmpgtw       %xmm8,%xmm2
-  .byte  102,65,15,223,208                   // pandn         %xmm8,%xmm2
-  .byte  102,65,15,97,209                    // punpcklwd     %xmm9,%xmm2
-  .byte  102,15,114,242,13                   // pslld         $0xd,%xmm2
-  .byte  65,15,89,210                        // mulps         %xmm10,%xmm2
-  .byte  102,69,15,112,192,78                // pshufd        $0x4e,%xmm8,%xmm8
-  .byte  102,65,15,101,216                   // pcmpgtw       %xmm8,%xmm3
-  .byte  102,65,15,223,216                   // pandn         %xmm8,%xmm3
-  .byte  102,65,15,97,217                    // punpcklwd     %xmm9,%xmm3
-  .byte  102,15,114,243,13                   // pslld         $0xd,%xmm3
-  .byte  65,15,89,218                        // mulps         %xmm10,%xmm3
+  .byte  102,69,15,111,224                   // movdqa        %xmm8,%xmm12
+  .byte  102,69,15,97,225                    // punpcklwd     %xmm9,%xmm12
+  .byte  102,69,15,111,236                   // movdqa        %xmm12,%xmm13
+  .byte  102,69,15,219,234                   // pand          %xmm10,%xmm13
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,15,219,211                      // pand          %xmm3,%xmm2
+  .byte  102,69,15,239,229                   // pxor          %xmm13,%xmm12
+  .byte  102,65,15,114,245,16                // pslld         $0x10,%xmm13
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,69,15,254,235                   // paddd         %xmm11,%xmm13
+  .byte  102,69,15,254,236                   // paddd         %xmm12,%xmm13
+  .byte  102,65,15,118,209                   // pcmpeqd       %xmm9,%xmm2
+  .byte  102,65,15,223,213                   // pandn         %xmm13,%xmm2
+  .byte  102,65,15,115,216,8                 // psrldq        $0x8,%xmm8
+  .byte  102,69,15,97,193                    // punpcklwd     %xmm9,%xmm8
+  .byte  102,69,15,219,208                   // pand          %xmm8,%xmm10
+  .byte  102,65,15,219,216                   // pand          %xmm8,%xmm3
+  .byte  102,69,15,239,194                   // pxor          %xmm10,%xmm8
+  .byte  102,65,15,114,242,16                // pslld         $0x10,%xmm10
+  .byte  102,65,15,114,240,13                // pslld         $0xd,%xmm8
+  .byte  102,69,15,254,211                   // paddd         %xmm11,%xmm10
+  .byte  102,69,15,254,208                   // paddd         %xmm8,%xmm10
+  .byte  102,65,15,118,217                   // pcmpeqd       %xmm9,%xmm3
+  .byte  102,65,15,223,218                   // pandn         %xmm10,%xmm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
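The SSE2 flavor just above computes the same thing as the SSE4.1 version; the widening difference is that pmovzxwd (an SSE4.1 instruction) is replaced by interleaving with a zeroed register. A one-line intrinsic sketch of that substitution (widen_u16_sse2 is an illustrative name):

    #include <emmintrin.h>

    // punpcklwd against zero == zero-extend the low four u16 lanes to u32.
    static __m128i widen_u16_sse2(__m128i h) {
        return _mm_unpacklo_epi16(h, _mm_setzero_si128());
    }
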
@@ -25511,41 +25841,69 @@
   .byte  102,68,15,111,193                   // movdqa        %xmm1,%xmm8
   .byte  102,68,15,97,194                    // punpcklwd     %xmm2,%xmm8
   .byte  102,15,105,202                      // punpckhwd     %xmm2,%xmm1
-  .byte  102,65,15,111,208                   // movdqa        %xmm8,%xmm2
-  .byte  102,15,97,209                       // punpcklwd     %xmm1,%xmm2
+  .byte  102,69,15,111,224                   // movdqa        %xmm8,%xmm12
+  .byte  102,68,15,97,225                    // punpcklwd     %xmm1,%xmm12
   .byte  102,68,15,105,193                   // punpckhwd     %xmm1,%xmm8
-  .byte  184,0,4,0,4                         // mov           $0x4000400,%eax
+  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
+  .byte  102,69,15,111,236                   // movdqa        %xmm12,%xmm13
+  .byte  102,69,15,97,233                    // punpcklwd     %xmm9,%xmm13
+  .byte  184,0,128,0,0                       // mov           $0x8000,%eax
+  .byte  102,15,110,192                      // movd          %eax,%xmm0
+  .byte  102,68,15,112,208,0                 // pshufd        $0x0,%xmm0,%xmm10
+  .byte  102,65,15,111,205                   // movdqa        %xmm13,%xmm1
+  .byte  102,65,15,219,202                   // pand          %xmm10,%xmm1
+  .byte  184,0,124,0,0                       // mov           $0x7c00,%eax
   .byte  102,15,110,192                      // movd          %eax,%xmm0
   .byte  102,15,112,216,0                    // pshufd        $0x0,%xmm0,%xmm3
-  .byte  102,15,111,195                      // movdqa        %xmm3,%xmm0
-  .byte  102,15,101,194                      // pcmpgtw       %xmm2,%xmm0
-  .byte  102,15,223,194                      // pandn         %xmm2,%xmm0
-  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
-  .byte  102,65,15,97,193                    // punpcklwd     %xmm9,%xmm0
-  .byte  102,15,114,240,13                   // pslld         $0xd,%xmm0
-  .byte  184,0,0,128,119                     // mov           $0x77800000,%eax
-  .byte  102,15,110,200                      // movd          %eax,%xmm1
-  .byte  102,68,15,112,209,0                 // pshufd        $0x0,%xmm1,%xmm10
-  .byte  65,15,89,194                        // mulps         %xmm10,%xmm0
-  .byte  102,15,112,210,78                   // pshufd        $0x4e,%xmm2,%xmm2
-  .byte  102,15,111,203                      // movdqa        %xmm3,%xmm1
-  .byte  102,15,101,202                      // pcmpgtw       %xmm2,%xmm1
+  .byte  102,65,15,111,197                   // movdqa        %xmm13,%xmm0
+  .byte  102,15,219,195                      // pand          %xmm3,%xmm0
+  .byte  102,68,15,239,233                   // pxor          %xmm1,%xmm13
+  .byte  102,15,114,241,16                   // pslld         $0x10,%xmm1
+  .byte  102,65,15,114,245,13                // pslld         $0xd,%xmm13
+  .byte  184,0,0,0,56                        // mov           $0x38000000,%eax
+  .byte  102,15,110,208                      // movd          %eax,%xmm2
+  .byte  102,68,15,112,218,0                 // pshufd        $0x0,%xmm2,%xmm11
+  .byte  102,65,15,254,203                   // paddd         %xmm11,%xmm1
+  .byte  102,65,15,254,205                   // paddd         %xmm13,%xmm1
+  .byte  102,65,15,118,193                   // pcmpeqd       %xmm9,%xmm0
+  .byte  102,15,223,193                      // pandn         %xmm1,%xmm0
+  .byte  102,65,15,115,220,8                 // psrldq        $0x8,%xmm12
+  .byte  102,69,15,97,225                    // punpcklwd     %xmm9,%xmm12
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,65,15,219,210                   // pand          %xmm10,%xmm2
+  .byte  102,65,15,111,204                   // movdqa        %xmm12,%xmm1
+  .byte  102,15,219,203                      // pand          %xmm3,%xmm1
+  .byte  102,68,15,239,226                   // pxor          %xmm2,%xmm12
+  .byte  102,15,114,242,16                   // pslld         $0x10,%xmm2
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,65,15,254,211                   // paddd         %xmm11,%xmm2
+  .byte  102,65,15,254,212                   // paddd         %xmm12,%xmm2
+  .byte  102,65,15,118,201                   // pcmpeqd       %xmm9,%xmm1
   .byte  102,15,223,202                      // pandn         %xmm2,%xmm1
-  .byte  102,65,15,97,201                    // punpcklwd     %xmm9,%xmm1
-  .byte  102,15,114,241,13                   // pslld         $0xd,%xmm1
-  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,65,15,101,208                   // pcmpgtw       %xmm8,%xmm2
-  .byte  102,65,15,223,208                   // pandn         %xmm8,%xmm2
-  .byte  102,65,15,97,209                    // punpcklwd     %xmm9,%xmm2
-  .byte  102,15,114,242,13                   // pslld         $0xd,%xmm2
-  .byte  65,15,89,210                        // mulps         %xmm10,%xmm2
-  .byte  102,69,15,112,192,78                // pshufd        $0x4e,%xmm8,%xmm8
-  .byte  102,65,15,101,216                   // pcmpgtw       %xmm8,%xmm3
-  .byte  102,65,15,223,216                   // pandn         %xmm8,%xmm3
-  .byte  102,65,15,97,217                    // punpcklwd     %xmm9,%xmm3
-  .byte  102,15,114,243,13                   // pslld         $0xd,%xmm3
-  .byte  65,15,89,218                        // mulps         %xmm10,%xmm3
+  .byte  102,69,15,111,224                   // movdqa        %xmm8,%xmm12
+  .byte  102,69,15,97,225                    // punpcklwd     %xmm9,%xmm12
+  .byte  102,69,15,111,236                   // movdqa        %xmm12,%xmm13
+  .byte  102,69,15,219,234                   // pand          %xmm10,%xmm13
+  .byte  102,65,15,111,212                   // movdqa        %xmm12,%xmm2
+  .byte  102,15,219,211                      // pand          %xmm3,%xmm2
+  .byte  102,69,15,239,229                   // pxor          %xmm13,%xmm12
+  .byte  102,65,15,114,245,16                // pslld         $0x10,%xmm13
+  .byte  102,65,15,114,244,13                // pslld         $0xd,%xmm12
+  .byte  102,69,15,254,235                   // paddd         %xmm11,%xmm13
+  .byte  102,69,15,254,236                   // paddd         %xmm12,%xmm13
+  .byte  102,65,15,118,209                   // pcmpeqd       %xmm9,%xmm2
+  .byte  102,65,15,223,213                   // pandn         %xmm13,%xmm2
+  .byte  102,65,15,115,216,8                 // psrldq        $0x8,%xmm8
+  .byte  102,69,15,97,193                    // punpcklwd     %xmm9,%xmm8
+  .byte  102,69,15,219,208                   // pand          %xmm8,%xmm10
+  .byte  102,65,15,219,216                   // pand          %xmm8,%xmm3
+  .byte  102,69,15,239,194                   // pxor          %xmm10,%xmm8
+  .byte  102,65,15,114,242,16                // pslld         $0x10,%xmm10
+  .byte  102,65,15,114,240,13                // pslld         $0xd,%xmm8
+  .byte  102,69,15,254,211                   // paddd         %xmm11,%xmm10
+  .byte  102,69,15,254,208                   // paddd         %xmm8,%xmm10
+  .byte  102,65,15,118,217                   // pcmpeqd       %xmm9,%xmm3
+  .byte  102,65,15,223,218                   // pandn         %xmm10,%xmm3
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
@@ -25555,38 +25913,76 @@
 _sk_store_f16_sse2:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  185,0,0,128,7                       // mov           $0x7800000,%ecx
+  .byte  185,0,0,0,128                       // mov           $0x80000000,%ecx
   .byte  102,68,15,110,193                   // movd          %ecx,%xmm8
   .byte  102,69,15,112,200,0                 // pshufd        $0x0,%xmm8,%xmm9
-  .byte  102,69,15,111,193                   // movdqa        %xmm9,%xmm8
-  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
-  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
-  .byte  102,65,15,114,240,16                // pslld         $0x10,%xmm8
-  .byte  102,65,15,114,224,16                // psrad         $0x10,%xmm8
+  .byte  102,69,15,111,225                   // movdqa        %xmm9,%xmm12
+  .byte  102,68,15,219,224                   // pand          %xmm0,%xmm12
+  .byte  102,68,15,111,192                   // movdqa        %xmm0,%xmm8
+  .byte  102,69,15,239,196                   // pxor          %xmm12,%xmm8
+  .byte  185,0,0,128,56                      // mov           $0x38800000,%ecx
+  .byte  102,68,15,110,209                   // movd          %ecx,%xmm10
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
+  .byte  102,65,15,114,212,16                // psrld         $0x10,%xmm12
+  .byte  102,69,15,111,232                   // movdqa        %xmm8,%xmm13
+  .byte  102,65,15,114,213,13                // psrld         $0xd,%xmm13
+  .byte  185,0,192,1,0                       // mov           $0x1c000,%ecx
+  .byte  102,68,15,110,217                   // movd          %ecx,%xmm11
+  .byte  102,69,15,112,219,0                 // pshufd        $0x0,%xmm11,%xmm11
+  .byte  102,69,15,250,227                   // psubd         %xmm11,%xmm12
+  .byte  102,69,15,254,229                   // paddd         %xmm13,%xmm12
+  .byte  102,65,15,114,244,16                // pslld         $0x10,%xmm12
+  .byte  102,65,15,114,228,16                // psrad         $0x10,%xmm12
+  .byte  69,15,194,194,5                     // cmpnltps      %xmm10,%xmm8
+  .byte  69,15,84,196                        // andps         %xmm12,%xmm8
   .byte  102,69,15,107,192                   // packssdw      %xmm8,%xmm8
-  .byte  102,69,15,111,209                   // movdqa        %xmm9,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  102,65,15,114,210,13                // psrld         $0xd,%xmm10
-  .byte  102,65,15,114,242,16                // pslld         $0x10,%xmm10
-  .byte  102,65,15,114,226,16                // psrad         $0x10,%xmm10
-  .byte  102,69,15,107,210                   // packssdw      %xmm10,%xmm10
-  .byte  102,69,15,111,217                   // movdqa        %xmm9,%xmm11
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  102,65,15,114,211,13                // psrld         $0xd,%xmm11
-  .byte  102,65,15,114,243,16                // pslld         $0x10,%xmm11
-  .byte  102,65,15,114,227,16                // psrad         $0x10,%xmm11
-  .byte  102,69,15,107,219                   // packssdw      %xmm11,%xmm11
-  .byte  68,15,89,203                        // mulps         %xmm3,%xmm9
-  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
+  .byte  102,69,15,111,233                   // movdqa        %xmm9,%xmm13
+  .byte  102,68,15,219,233                   // pand          %xmm1,%xmm13
+  .byte  102,68,15,111,225                   // movdqa        %xmm1,%xmm12
+  .byte  102,69,15,239,229                   // pxor          %xmm13,%xmm12
+  .byte  102,65,15,114,213,16                // psrld         $0x10,%xmm13
+  .byte  102,69,15,111,244                   // movdqa        %xmm12,%xmm14
+  .byte  102,65,15,114,214,13                // psrld         $0xd,%xmm14
+  .byte  102,69,15,250,235                   // psubd         %xmm11,%xmm13
+  .byte  102,69,15,254,238                   // paddd         %xmm14,%xmm13
+  .byte  102,65,15,114,245,16                // pslld         $0x10,%xmm13
+  .byte  102,65,15,114,229,16                // psrad         $0x10,%xmm13
+  .byte  69,15,194,226,5                     // cmpnltps      %xmm10,%xmm12
+  .byte  69,15,84,229                        // andps         %xmm13,%xmm12
+  .byte  102,69,15,107,228                   // packssdw      %xmm12,%xmm12
+  .byte  102,69,15,111,241                   // movdqa        %xmm9,%xmm14
+  .byte  102,68,15,219,242                   // pand          %xmm2,%xmm14
+  .byte  102,68,15,111,234                   // movdqa        %xmm2,%xmm13
+  .byte  102,69,15,239,238                   // pxor          %xmm14,%xmm13
+  .byte  102,65,15,114,214,16                // psrld         $0x10,%xmm14
+  .byte  102,69,15,111,253                   // movdqa        %xmm13,%xmm15
+  .byte  102,65,15,114,215,13                // psrld         $0xd,%xmm15
+  .byte  102,69,15,250,243                   // psubd         %xmm11,%xmm14
+  .byte  102,69,15,254,247                   // paddd         %xmm15,%xmm14
+  .byte  102,65,15,114,246,16                // pslld         $0x10,%xmm14
+  .byte  102,65,15,114,230,16                // psrad         $0x10,%xmm14
+  .byte  69,15,194,234,5                     // cmpnltps      %xmm10,%xmm13
+  .byte  69,15,84,238                        // andps         %xmm14,%xmm13
+  .byte  102,69,15,107,237                   // packssdw      %xmm13,%xmm13
+  .byte  102,68,15,219,203                   // pand          %xmm3,%xmm9
+  .byte  102,68,15,111,243                   // movdqa        %xmm3,%xmm14
+  .byte  102,69,15,239,241                   // pxor          %xmm9,%xmm14
+  .byte  102,65,15,114,209,16                // psrld         $0x10,%xmm9
+  .byte  102,69,15,111,254                   // movdqa        %xmm14,%xmm15
+  .byte  102,65,15,114,215,13                // psrld         $0xd,%xmm15
+  .byte  102,69,15,250,203                   // psubd         %xmm11,%xmm9
+  .byte  102,69,15,254,207                   // paddd         %xmm15,%xmm9
   .byte  102,65,15,114,241,16                // pslld         $0x10,%xmm9
   .byte  102,65,15,114,225,16                // psrad         $0x10,%xmm9
-  .byte  102,69,15,107,201                   // packssdw      %xmm9,%xmm9
-  .byte  102,69,15,97,194                    // punpcklwd     %xmm10,%xmm8
-  .byte  102,69,15,97,217                    // punpcklwd     %xmm9,%xmm11
+  .byte  69,15,194,242,5                     // cmpnltps      %xmm10,%xmm14
+  .byte  69,15,84,241                        // andps         %xmm9,%xmm14
+  .byte  102,69,15,107,246                   // packssdw      %xmm14,%xmm14
+  .byte  102,69,15,97,196                    // punpcklwd     %xmm12,%xmm8
+  .byte  102,69,15,97,238                    // punpcklwd     %xmm14,%xmm13
   .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  102,69,15,98,203                    // punpckldq     %xmm11,%xmm9
+  .byte  102,69,15,98,205                    // punpckldq     %xmm13,%xmm9
   .byte  243,68,15,127,12,248                // movdqu        %xmm9,(%rax,%rdi,8)
-  .byte  102,69,15,106,195                   // punpckhdq     %xmm11,%xmm8
+  .byte  102,69,15,106,197                   // punpckhdq     %xmm13,%xmm8
   .byte  243,68,15,127,68,248,16             // movdqu        %xmm8,0x10(%rax,%rdi,8)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -26203,7 +26599,7 @@
   .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
   .byte  72,139,8                            // mov           (%rax),%rcx
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,132,15,1,0,0                     // je            3ca2 <_sk_linear_gradient_sse2+0x149>
+  .byte  15,132,15,1,0,0                     // je            3e99 <_sk_linear_gradient_sse2+0x149>
   .byte  72,139,64,8                         // mov           0x8(%rax),%rax
   .byte  72,131,192,32                       // add           $0x20,%rax
   .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
@@ -26264,8 +26660,8 @@
   .byte  69,15,86,231                        // orps          %xmm15,%xmm12
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  72,255,201                          // dec           %rcx
-  .byte  15,133,8,255,255,255                // jne           3ba8 <_sk_linear_gradient_sse2+0x4f>
-  .byte  235,13                              // jmp           3caf <_sk_linear_gradient_sse2+0x156>
+  .byte  15,133,8,255,255,255                // jne           3d9f <_sk_linear_gradient_sse2+0x4f>
+  .byte  235,13                              // jmp           3ea6 <_sk_linear_gradient_sse2+0x156>
   .byte  15,87,201                           // xorps         %xmm1,%xmm1
   .byte  15,87,210                           // xorps         %xmm2,%xmm2
   .byte  15,87,219                           // xorps         %xmm3,%xmm3
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 1a3bb5e..073ad90 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -8327,94 +8327,163 @@
 
 PUBLIC _sk_load_f16_avx
 _sk_load_f16_avx LABEL PROC
+  DB  72,131,236,120                      ; sub           $0x78,%rsp
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,17,1,0,0                     ; jne           440f <_sk_load_f16_avx+0x11f>
+  DB  197,252,17,124,36,64                ; vmovups       %ymm7,0x40(%rsp)
+  DB  197,252,17,116,36,32                ; vmovups       %ymm6,0x20(%rsp)
+  DB  197,252,17,44,36                    ; vmovups       %ymm5,(%rsp)
+  DB  15,133,104,2,0,0                    ; jne           457b <_sk_load_f16_avx+0x28b>
   DB  197,121,16,4,248                    ; vmovupd       (%rax,%rdi,8),%xmm8
   DB  197,249,16,84,248,16                ; vmovupd       0x10(%rax,%rdi,8),%xmm2
-  DB  197,249,16,92,248,32                ; vmovupd       0x20(%rax,%rdi,8),%xmm3
+  DB  197,249,16,76,248,32                ; vmovupd       0x20(%rax,%rdi,8),%xmm1
   DB  197,122,111,76,248,48               ; vmovdqu       0x30(%rax,%rdi,8),%xmm9
   DB  197,185,97,194                      ; vpunpcklwd    %xmm2,%xmm8,%xmm0
   DB  197,185,105,210                     ; vpunpckhwd    %xmm2,%xmm8,%xmm2
-  DB  196,193,97,97,201                   ; vpunpcklwd    %xmm9,%xmm3,%xmm1
-  DB  196,193,97,105,217                  ; vpunpckhwd    %xmm9,%xmm3,%xmm3
-  DB  197,121,97,218                      ; vpunpcklwd    %xmm2,%xmm0,%xmm11
+  DB  196,193,113,97,217                  ; vpunpcklwd    %xmm9,%xmm1,%xmm3
+  DB  196,193,113,105,201                 ; vpunpckhwd    %xmm9,%xmm1,%xmm1
+  DB  197,121,97,242                      ; vpunpcklwd    %xmm2,%xmm0,%xmm14
   DB  197,121,105,194                     ; vpunpckhwd    %xmm2,%xmm0,%xmm8
-  DB  197,241,97,211                      ; vpunpcklwd    %xmm3,%xmm1,%xmm2
-  DB  197,113,105,203                     ; vpunpckhwd    %xmm3,%xmm1,%xmm9
-  DB  197,161,108,194                     ; vpunpcklqdq   %xmm2,%xmm11,%xmm0
-  DB  184,0,4,0,4                         ; mov           $0x4000400,%eax
-  DB  197,249,110,200                     ; vmovd         %eax,%xmm1
-  DB  197,121,112,233,0                   ; vpshufd       $0x0,%xmm1,%xmm13
-  DB  197,145,101,200                     ; vpcmpgtw      %xmm0,%xmm13,%xmm1
-  DB  197,241,223,192                     ; vpandn        %xmm0,%xmm1,%xmm0
-  DB  196,226,121,51,200                  ; vpmovzxwd     %xmm0,%xmm1
-  DB  196,65,41,239,210                   ; vpxor         %xmm10,%xmm10,%xmm10
-  DB  196,193,121,105,194                 ; vpunpckhwd    %xmm10,%xmm0,%xmm0
-  DB  197,241,114,241,13                  ; vpslld        $0xd,%xmm1,%xmm1
-  DB  197,249,114,240,13                  ; vpslld        $0xd,%xmm0,%xmm0
-  DB  196,227,117,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
-  DB  184,0,0,128,119                     ; mov           $0x77800000,%eax
+  DB  197,97,97,249                       ; vpunpcklwd    %xmm1,%xmm3,%xmm15
+  DB  197,97,105,209                      ; vpunpckhwd    %xmm1,%xmm3,%xmm10
+  DB  196,193,9,108,199                   ; vpunpcklqdq   %xmm15,%xmm14,%xmm0
+  DB  196,65,25,239,228                   ; vpxor         %xmm12,%xmm12,%xmm12
+  DB  196,193,121,105,204                 ; vpunpckhwd    %xmm12,%xmm0,%xmm1
+  DB  196,226,121,51,192                  ; vpmovzxwd     %xmm0,%xmm0
+  DB  196,227,125,24,193,1                ; vinsertf128   $0x1,%xmm1,%ymm0,%ymm0
+  DB  184,0,128,0,0                       ; mov           $0x8000,%eax
   DB  197,249,110,200                     ; vmovd         %eax,%xmm1
   DB  197,249,112,201,0                   ; vpshufd       $0x0,%xmm1,%xmm1
-  DB  196,99,117,24,225,1                 ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm12
-  DB  197,156,89,192                      ; vmulps        %ymm0,%ymm12,%ymm0
-  DB  197,161,109,202                     ; vpunpckhqdq   %xmm2,%xmm11,%xmm1
-  DB  197,145,101,209                     ; vpcmpgtw      %xmm1,%xmm13,%xmm2
-  DB  197,233,223,201                     ; vpandn        %xmm1,%xmm2,%xmm1
-  DB  196,226,121,51,209                  ; vpmovzxwd     %xmm1,%xmm2
-  DB  196,193,113,105,202                 ; vpunpckhwd    %xmm10,%xmm1,%xmm1
-  DB  197,233,114,242,13                  ; vpslld        $0xd,%xmm2,%xmm2
+  DB  196,99,117,24,201,1                 ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm9
+  DB  196,193,124,84,201                  ; vandps        %ymm9,%ymm0,%ymm1
+  DB  184,0,124,0,0                       ; mov           $0x7c00,%eax
+  DB  197,249,110,216                     ; vmovd         %eax,%xmm3
+  DB  197,249,112,219,0                   ; vpshufd       $0x0,%xmm3,%xmm3
+  DB  196,99,101,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm3,%ymm11
+  DB  196,193,124,84,219                  ; vandps        %ymm11,%ymm0,%ymm3
+  DB  197,252,87,193                      ; vxorps        %ymm1,%ymm0,%ymm0
+  DB  196,227,125,25,218,1                ; vextractf128  $0x1,%ymm3,%xmm2
+  DB  196,193,105,118,212                 ; vpcmpeqd      %xmm12,%xmm2,%xmm2
+  DB  196,193,97,118,220                  ; vpcmpeqd      %xmm12,%xmm3,%xmm3
+  DB  196,227,101,24,242,1                ; vinsertf128   $0x1,%xmm2,%ymm3,%ymm6
+  DB  196,227,125,25,203,1                ; vextractf128  $0x1,%ymm1,%xmm3
+  DB  197,145,114,243,16                  ; vpslld        $0x10,%xmm3,%xmm13
+  DB  196,227,125,25,195,1                ; vextractf128  $0x1,%ymm0,%xmm3
+  DB  197,233,114,243,13                  ; vpslld        $0xd,%xmm3,%xmm2
+  DB  184,0,0,0,56                        ; mov           $0x38000000,%eax
+  DB  197,249,110,216                     ; vmovd         %eax,%xmm3
+  DB  197,249,112,219,0                   ; vpshufd       $0x0,%xmm3,%xmm3
+  DB  197,145,254,251                     ; vpaddd        %xmm3,%xmm13,%xmm7
+  DB  197,193,254,210                     ; vpaddd        %xmm2,%xmm7,%xmm2
+  DB  197,241,114,241,16                  ; vpslld        $0x10,%xmm1,%xmm1
+  DB  197,249,114,240,13                  ; vpslld        $0xd,%xmm0,%xmm0
+  DB  197,241,254,203                     ; vpaddd        %xmm3,%xmm1,%xmm1
+  DB  197,241,254,192                     ; vpaddd        %xmm0,%xmm1,%xmm0
+  DB  196,227,125,24,194,1                ; vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
+  DB  196,65,20,87,237                    ; vxorps        %ymm13,%ymm13,%ymm13
+  DB  196,195,125,74,197,96               ; vblendvps     %ymm6,%ymm13,%ymm0,%ymm0
+  DB  196,193,9,109,207                   ; vpunpckhqdq   %xmm15,%xmm14,%xmm1
+  DB  196,193,113,105,212                 ; vpunpckhwd    %xmm12,%xmm1,%xmm2
+  DB  196,226,121,51,201                  ; vpmovzxwd     %xmm1,%xmm1
+  DB  196,227,117,24,202,1                ; vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  DB  196,193,116,84,209                  ; vandps        %ymm9,%ymm1,%ymm2
+  DB  196,193,116,84,243                  ; vandps        %ymm11,%ymm1,%ymm6
+  DB  197,244,87,202                      ; vxorps        %ymm2,%ymm1,%ymm1
+  DB  196,227,125,25,247,1                ; vextractf128  $0x1,%ymm6,%xmm7
+  DB  196,193,65,118,252                  ; vpcmpeqd      %xmm12,%xmm7,%xmm7
+  DB  196,193,73,118,244                  ; vpcmpeqd      %xmm12,%xmm6,%xmm6
+  DB  196,99,77,24,247,1                  ; vinsertf128   $0x1,%xmm7,%ymm6,%ymm14
+  DB  196,227,125,25,215,1                ; vextractf128  $0x1,%ymm2,%xmm7
+  DB  197,193,114,247,16                  ; vpslld        $0x10,%xmm7,%xmm7
+  DB  196,227,125,25,206,1                ; vextractf128  $0x1,%ymm1,%xmm6
+  DB  197,201,114,246,13                  ; vpslld        $0xd,%xmm6,%xmm6
+  DB  197,193,254,251                     ; vpaddd        %xmm3,%xmm7,%xmm7
+  DB  197,193,254,246                     ; vpaddd        %xmm6,%xmm7,%xmm6
+  DB  197,233,114,242,16                  ; vpslld        $0x10,%xmm2,%xmm2
   DB  197,241,114,241,13                  ; vpslld        $0xd,%xmm1,%xmm1
-  DB  196,227,109,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  DB  197,156,89,201                      ; vmulps        %ymm1,%ymm12,%ymm1
-  DB  196,193,57,108,209                  ; vpunpcklqdq   %xmm9,%xmm8,%xmm2
-  DB  197,145,101,218                     ; vpcmpgtw      %xmm2,%xmm13,%xmm3
-  DB  197,225,223,210                     ; vpandn        %xmm2,%xmm3,%xmm2
-  DB  196,226,121,51,218                  ; vpmovzxwd     %xmm2,%xmm3
-  DB  196,193,105,105,210                 ; vpunpckhwd    %xmm10,%xmm2,%xmm2
-  DB  197,225,114,243,13                  ; vpslld        $0xd,%xmm3,%xmm3
+  DB  197,233,254,211                     ; vpaddd        %xmm3,%xmm2,%xmm2
+  DB  197,233,254,201                     ; vpaddd        %xmm1,%xmm2,%xmm1
+  DB  196,227,117,24,206,1                ; vinsertf128   $0x1,%xmm6,%ymm1,%ymm1
+  DB  196,195,117,74,205,224              ; vblendvps     %ymm14,%ymm13,%ymm1,%ymm1
+  DB  196,193,57,108,210                  ; vpunpcklqdq   %xmm10,%xmm8,%xmm2
+  DB  196,193,105,105,244                 ; vpunpckhwd    %xmm12,%xmm2,%xmm6
+  DB  196,226,121,51,210                  ; vpmovzxwd     %xmm2,%xmm2
+  DB  196,227,109,24,214,1                ; vinsertf128   $0x1,%xmm6,%ymm2,%ymm2
+  DB  196,193,108,84,243                  ; vandps        %ymm11,%ymm2,%ymm6
+  DB  196,227,125,25,247,1                ; vextractf128  $0x1,%ymm6,%xmm7
+  DB  196,193,65,118,252                  ; vpcmpeqd      %xmm12,%xmm7,%xmm7
+  DB  196,193,73,118,244                  ; vpcmpeqd      %xmm12,%xmm6,%xmm6
+  DB  196,99,77,24,247,1                  ; vinsertf128   $0x1,%xmm7,%ymm6,%ymm14
+  DB  196,193,108,84,249                  ; vandps        %ymm9,%ymm2,%ymm7
+  DB  197,236,87,215                      ; vxorps        %ymm7,%ymm2,%ymm2
+  DB  196,227,125,25,254,1                ; vextractf128  $0x1,%ymm7,%xmm6
+  DB  197,129,114,246,16                  ; vpslld        $0x10,%xmm6,%xmm15
+  DB  196,227,125,25,214,1                ; vextractf128  $0x1,%ymm2,%xmm6
+  DB  197,209,114,246,13                  ; vpslld        $0xd,%xmm6,%xmm5
+  DB  197,129,254,243                     ; vpaddd        %xmm3,%xmm15,%xmm6
+  DB  197,201,254,237                     ; vpaddd        %xmm5,%xmm6,%xmm5
+  DB  197,201,114,247,16                  ; vpslld        $0x10,%xmm7,%xmm6
   DB  197,233,114,242,13                  ; vpslld        $0xd,%xmm2,%xmm2
-  DB  196,227,101,24,210,1                ; vinsertf128   $0x1,%xmm2,%ymm3,%ymm2
-  DB  197,156,89,210                      ; vmulps        %ymm2,%ymm12,%ymm2
-  DB  196,65,57,109,193                   ; vpunpckhqdq   %xmm9,%xmm8,%xmm8
-  DB  196,193,17,101,216                  ; vpcmpgtw      %xmm8,%xmm13,%xmm3
-  DB  196,193,97,223,216                  ; vpandn        %xmm8,%xmm3,%xmm3
-  DB  196,98,121,51,195                   ; vpmovzxwd     %xmm3,%xmm8
-  DB  196,193,97,105,218                  ; vpunpckhwd    %xmm10,%xmm3,%xmm3
-  DB  196,193,57,114,240,13               ; vpslld        $0xd,%xmm8,%xmm8
-  DB  197,225,114,243,13                  ; vpslld        $0xd,%xmm3,%xmm3
-  DB  196,227,61,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  DB  197,156,89,219                      ; vmulps        %ymm3,%ymm12,%ymm3
+  DB  197,201,254,243                     ; vpaddd        %xmm3,%xmm6,%xmm6
+  DB  197,201,254,210                     ; vpaddd        %xmm2,%xmm6,%xmm2
+  DB  196,227,109,24,213,1                ; vinsertf128   $0x1,%xmm5,%ymm2,%ymm2
+  DB  196,195,109,74,213,224              ; vblendvps     %ymm14,%ymm13,%ymm2,%ymm2
+  DB  196,193,57,109,234                  ; vpunpckhqdq   %xmm10,%xmm8,%xmm5
+  DB  196,193,81,105,244                  ; vpunpckhwd    %xmm12,%xmm5,%xmm6
+  DB  196,226,121,51,237                  ; vpmovzxwd     %xmm5,%xmm5
+  DB  196,227,85,24,238,1                 ; vinsertf128   $0x1,%xmm6,%ymm5,%ymm5
+  DB  196,193,84,84,243                   ; vandps        %ymm11,%ymm5,%ymm6
+  DB  196,227,125,25,247,1                ; vextractf128  $0x1,%ymm6,%xmm7
+  DB  196,193,65,118,252                  ; vpcmpeqd      %xmm12,%xmm7,%xmm7
+  DB  196,193,73,118,244                  ; vpcmpeqd      %xmm12,%xmm6,%xmm6
+  DB  196,65,84,84,193                    ; vandps        %ymm9,%ymm5,%ymm8
+  DB  196,193,84,87,232                   ; vxorps        %ymm8,%ymm5,%ymm5
+  DB  196,99,77,24,207,1                  ; vinsertf128   $0x1,%xmm7,%ymm6,%ymm9
+  DB  196,99,125,25,199,1                 ; vextractf128  $0x1,%ymm8,%xmm7
+  DB  197,193,114,247,16                  ; vpslld        $0x10,%xmm7,%xmm7
+  DB  196,193,73,114,240,16               ; vpslld        $0x10,%xmm8,%xmm6
+  DB  197,201,254,243                     ; vpaddd        %xmm3,%xmm6,%xmm6
+  DB  197,193,254,219                     ; vpaddd        %xmm3,%xmm7,%xmm3
+  DB  196,227,125,25,239,1                ; vextractf128  $0x1,%ymm5,%xmm7
+  DB  197,193,114,247,13                  ; vpslld        $0xd,%xmm7,%xmm7
+  DB  197,225,254,223                     ; vpaddd        %xmm7,%xmm3,%xmm3
+  DB  197,209,114,245,13                  ; vpslld        $0xd,%xmm5,%xmm5
+  DB  197,201,254,237                     ; vpaddd        %xmm5,%xmm6,%xmm5
+  DB  196,227,85,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm5,%ymm3
+  DB  196,195,101,74,221,144              ; vblendvps     %ymm9,%ymm13,%ymm3,%ymm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,44,36                    ; vmovups       (%rsp),%ymm5
+  DB  197,252,16,116,36,32                ; vmovups       0x20(%rsp),%ymm6
+  DB  197,252,16,124,36,64                ; vmovups       0x40(%rsp),%ymm7
+  DB  72,131,196,120                      ; add           $0x78,%rsp
   DB  255,224                             ; jmpq          *%rax
   DB  197,123,16,4,248                    ; vmovsd        (%rax,%rdi,8),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,79                              ; je            446e <_sk_load_f16_avx+0x17e>
+  DB  116,79                              ; je            45da <_sk_load_f16_avx+0x2ea>
   DB  197,57,22,68,248,8                  ; vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,67                              ; jb            446e <_sk_load_f16_avx+0x17e>
+  DB  114,67                              ; jb            45da <_sk_load_f16_avx+0x2ea>
   DB  197,251,16,84,248,16                ; vmovsd        0x10(%rax,%rdi,8),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,68                              ; je            447b <_sk_load_f16_avx+0x18b>
+  DB  116,68                              ; je            45e7 <_sk_load_f16_avx+0x2f7>
   DB  197,233,22,84,248,24                ; vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,56                              ; jb            447b <_sk_load_f16_avx+0x18b>
-  DB  197,251,16,92,248,32                ; vmovsd        0x20(%rax,%rdi,8),%xmm3
+  DB  114,56                              ; jb            45e7 <_sk_load_f16_avx+0x2f7>
+  DB  197,251,16,76,248,32                ; vmovsd        0x20(%rax,%rdi,8),%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,194,254,255,255              ; je            4315 <_sk_load_f16_avx+0x25>
-  DB  197,225,22,92,248,40                ; vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
+  DB  15,132,107,253,255,255              ; je            432a <_sk_load_f16_avx+0x3a>
+  DB  197,241,22,76,248,40                ; vmovhpd       0x28(%rax,%rdi,8),%xmm1,%xmm1
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,178,254,255,255              ; jb            4315 <_sk_load_f16_avx+0x25>
+  DB  15,130,91,253,255,255               ; jb            432a <_sk_load_f16_avx+0x3a>
   DB  197,122,126,76,248,48               ; vmovq         0x30(%rax,%rdi,8),%xmm9
-  DB  233,167,254,255,255                 ; jmpq          4315 <_sk_load_f16_avx+0x25>
-  DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
+  DB  233,80,253,255,255                  ; jmpq          432a <_sk_load_f16_avx+0x3a>
+  DB  197,241,87,201                      ; vxorpd        %xmm1,%xmm1,%xmm1
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,154,254,255,255                 ; jmpq          4315 <_sk_load_f16_avx+0x25>
-  DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,145,254,255,255                 ; jmpq          4315 <_sk_load_f16_avx+0x25>
+  DB  233,67,253,255,255                  ; jmpq          432a <_sk_load_f16_avx+0x3a>
+  DB  197,241,87,201                      ; vxorpd        %xmm1,%xmm1,%xmm1
+  DB  233,58,253,255,255                  ; jmpq          432a <_sk_load_f16_avx+0x3a>
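
The constants in the rewritten load path above (0x8000 for the half sign
bit, 0x7c00 for the half exponent field, the 16- and 13-bit left shifts,
the 0x38000000 rebias, and the blend with zero) are the pieces of a
flush-to-zero half->float bit conversion. A minimal scalar sketch of that
logic, reconstructed from those constants (the names and the standalone
C++ framing here are illustrative, not lifted from the Skia source):

    #include <cstdint>
    #include <cstring>

    // half is 1-5-10 (sign-exponent-mantissa) with exponent bias 15;
    // float is 1-8-23 with exponent bias 127.
    static float half_to_float(uint16_t h) {
        uint32_t sem = h,
                 s   = sem & 0x8000,   // sign bit
                 em  = sem ^ s;        // exponent + mantissa

        // Exponent field zero means zero or a denormal half; both flush
        // to zero, matching the vpcmpeqd against the 0x7c00 mask above.
        if ((sem & 0x7c00) == 0) { return 0.0f; }

        // The sign moves up to bit 31, the 5-10 payload moves up 13
        // bits, and the exponent is rebiased: (127-15)<<23 == 0x38000000.
        uint32_t bits = (s << 16) + (em << 13) + ((127 - 15) << 23);
        float f;
        std::memcpy(&f, &bits, sizeof f);
        return f;   // note: half Inf/NaN come out as large finite floats
    }

Spot check: 0x3c00 maps to bits 0x3f800000 (1.0f) and 0xc000 maps to
0xc0000000 (-2.0f), so normal values convert exactly; the blend-to-zero
also drops the sign of flushed denormals, as the vector code does.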
 
 PUBLIC _sk_gather_f16_avx
 _sk_gather_f16_avx LABEL PROC
@@ -8422,6 +8491,11 @@
   DB  65,86                               ; push          %r14
   DB  65,84                               ; push          %r12
   DB  83                                  ; push          %rbx
+  DB  72,129,236,152,0,0,0                ; sub           $0x98,%rsp
+  DB  197,252,17,124,36,96                ; vmovups       %ymm7,0x60(%rsp)
+  DB  197,252,17,116,36,64                ; vmovups       %ymm6,0x40(%rsp)
+  DB  197,252,17,108,36,32                ; vmovups       %ymm5,0x20(%rsp)
+  DB  197,252,17,36,36                    ; vmovups       %ymm4,(%rsp)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  197,254,91,209                      ; vcvttps2dq    %ymm1,%ymm2
@@ -8462,55 +8536,121 @@
   DB  197,177,105,201                     ; vpunpckhwd    %xmm1,%xmm9,%xmm1
   DB  197,169,97,211                      ; vpunpcklwd    %xmm3,%xmm10,%xmm2
   DB  197,169,105,219                     ; vpunpckhwd    %xmm3,%xmm10,%xmm3
-  DB  197,121,97,217                      ; vpunpcklwd    %xmm1,%xmm0,%xmm11
+  DB  197,121,97,241                      ; vpunpcklwd    %xmm1,%xmm0,%xmm14
   DB  197,121,105,193                     ; vpunpckhwd    %xmm1,%xmm0,%xmm8
-  DB  197,233,97,203                      ; vpunpcklwd    %xmm3,%xmm2,%xmm1
-  DB  197,105,105,203                     ; vpunpckhwd    %xmm3,%xmm2,%xmm9
-  DB  197,161,108,193                     ; vpunpcklqdq   %xmm1,%xmm11,%xmm0
-  DB  184,0,4,0,4                         ; mov           $0x4000400,%eax
-  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
-  DB  197,121,112,234,0                   ; vpshufd       $0x0,%xmm2,%xmm13
-  DB  197,145,101,208                     ; vpcmpgtw      %xmm0,%xmm13,%xmm2
-  DB  197,233,223,192                     ; vpandn        %xmm0,%xmm2,%xmm0
-  DB  196,226,121,51,208                  ; vpmovzxwd     %xmm0,%xmm2
-  DB  196,65,41,239,210                   ; vpxor         %xmm10,%xmm10,%xmm10
-  DB  196,193,121,105,194                 ; vpunpckhwd    %xmm10,%xmm0,%xmm0
-  DB  197,233,114,242,13                  ; vpslld        $0xd,%xmm2,%xmm2
-  DB  197,249,114,240,13                  ; vpslld        $0xd,%xmm0,%xmm0
-  DB  196,227,109,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm2,%ymm0
-  DB  184,0,0,128,119                     ; mov           $0x77800000,%eax
+  DB  197,105,97,251                      ; vpunpcklwd    %xmm3,%xmm2,%xmm15
+  DB  197,105,105,211                     ; vpunpckhwd    %xmm3,%xmm2,%xmm10
+  DB  196,193,9,108,199                   ; vpunpcklqdq   %xmm15,%xmm14,%xmm0
+  DB  196,65,25,239,228                   ; vpxor         %xmm12,%xmm12,%xmm12
+  DB  196,193,121,105,212                 ; vpunpckhwd    %xmm12,%xmm0,%xmm2
+  DB  196,226,121,51,192                  ; vpmovzxwd     %xmm0,%xmm0
+  DB  196,227,125,24,194,1                ; vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
+  DB  184,0,128,0,0                       ; mov           $0x8000,%eax
   DB  197,249,110,208                     ; vmovd         %eax,%xmm2
   DB  197,249,112,210,0                   ; vpshufd       $0x0,%xmm2,%xmm2
-  DB  196,99,109,24,226,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm12
-  DB  197,156,89,192                      ; vmulps        %ymm0,%ymm12,%ymm0
-  DB  197,161,109,201                     ; vpunpckhqdq   %xmm1,%xmm11,%xmm1
-  DB  197,145,101,209                     ; vpcmpgtw      %xmm1,%xmm13,%xmm2
-  DB  197,233,223,201                     ; vpandn        %xmm1,%xmm2,%xmm1
-  DB  196,226,121,51,209                  ; vpmovzxwd     %xmm1,%xmm2
-  DB  196,193,113,105,202                 ; vpunpckhwd    %xmm10,%xmm1,%xmm1
-  DB  197,233,114,242,13                  ; vpslld        $0xd,%xmm2,%xmm2
+  DB  196,99,109,24,202,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
+  DB  196,193,124,84,209                  ; vandps        %ymm9,%ymm0,%ymm2
+  DB  184,0,124,0,0                       ; mov           $0x7c00,%eax
+  DB  197,249,110,216                     ; vmovd         %eax,%xmm3
+  DB  197,249,112,219,0                   ; vpshufd       $0x0,%xmm3,%xmm3
+  DB  196,99,101,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm3,%ymm11
+  DB  196,193,124,84,219                  ; vandps        %ymm11,%ymm0,%ymm3
+  DB  197,252,87,194                      ; vxorps        %ymm2,%ymm0,%ymm0
+  DB  196,227,125,25,217,1                ; vextractf128  $0x1,%ymm3,%xmm1
+  DB  196,193,113,118,204                 ; vpcmpeqd      %xmm12,%xmm1,%xmm1
+  DB  196,193,97,118,220                  ; vpcmpeqd      %xmm12,%xmm3,%xmm3
+  DB  196,227,101,24,225,1                ; vinsertf128   $0x1,%xmm1,%ymm3,%ymm4
+  DB  196,227,125,25,211,1                ; vextractf128  $0x1,%ymm2,%xmm3
+  DB  197,145,114,243,16                  ; vpslld        $0x10,%xmm3,%xmm13
+  DB  196,227,125,25,195,1                ; vextractf128  $0x1,%ymm0,%xmm3
+  DB  197,241,114,243,13                  ; vpslld        $0xd,%xmm3,%xmm1
+  DB  184,0,0,0,56                        ; mov           $0x38000000,%eax
+  DB  197,249,110,216                     ; vmovd         %eax,%xmm3
+  DB  197,249,112,219,0                   ; vpshufd       $0x0,%xmm3,%xmm3
+  DB  197,145,254,251                     ; vpaddd        %xmm3,%xmm13,%xmm7
+  DB  197,193,254,201                     ; vpaddd        %xmm1,%xmm7,%xmm1
+  DB  197,233,114,242,16                  ; vpslld        $0x10,%xmm2,%xmm2
+  DB  197,249,114,240,13                  ; vpslld        $0xd,%xmm0,%xmm0
+  DB  197,233,254,211                     ; vpaddd        %xmm3,%xmm2,%xmm2
+  DB  197,233,254,192                     ; vpaddd        %xmm0,%xmm2,%xmm0
+  DB  196,227,125,24,193,1                ; vinsertf128   $0x1,%xmm1,%ymm0,%ymm0
+  DB  196,65,20,87,237                    ; vxorps        %ymm13,%ymm13,%ymm13
+  DB  196,195,125,74,197,64               ; vblendvps     %ymm4,%ymm13,%ymm0,%ymm0
+  DB  196,193,9,109,207                   ; vpunpckhqdq   %xmm15,%xmm14,%xmm1
+  DB  196,193,113,105,212                 ; vpunpckhwd    %xmm12,%xmm1,%xmm2
+  DB  196,226,121,51,201                  ; vpmovzxwd     %xmm1,%xmm1
+  DB  196,227,117,24,202,1                ; vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  DB  196,193,116,84,209                  ; vandps        %ymm9,%ymm1,%ymm2
+  DB  196,193,116,84,227                  ; vandps        %ymm11,%ymm1,%ymm4
+  DB  197,244,87,202                      ; vxorps        %ymm2,%ymm1,%ymm1
+  DB  196,227,125,25,231,1                ; vextractf128  $0x1,%ymm4,%xmm7
+  DB  196,193,65,118,252                  ; vpcmpeqd      %xmm12,%xmm7,%xmm7
+  DB  196,193,89,118,228                  ; vpcmpeqd      %xmm12,%xmm4,%xmm4
+  DB  196,227,93,24,231,1                 ; vinsertf128   $0x1,%xmm7,%ymm4,%ymm4
+  DB  196,227,125,25,215,1                ; vextractf128  $0x1,%ymm2,%xmm7
+  DB  197,193,114,247,16                  ; vpslld        $0x10,%xmm7,%xmm7
+  DB  196,227,125,25,206,1                ; vextractf128  $0x1,%ymm1,%xmm6
+  DB  197,201,114,246,13                  ; vpslld        $0xd,%xmm6,%xmm6
+  DB  197,193,254,251                     ; vpaddd        %xmm3,%xmm7,%xmm7
+  DB  197,193,254,246                     ; vpaddd        %xmm6,%xmm7,%xmm6
+  DB  197,233,114,242,16                  ; vpslld        $0x10,%xmm2,%xmm2
   DB  197,241,114,241,13                  ; vpslld        $0xd,%xmm1,%xmm1
-  DB  196,227,109,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  DB  197,156,89,201                      ; vmulps        %ymm1,%ymm12,%ymm1
-  DB  196,193,57,108,209                  ; vpunpcklqdq   %xmm9,%xmm8,%xmm2
-  DB  197,145,101,218                     ; vpcmpgtw      %xmm2,%xmm13,%xmm3
-  DB  197,225,223,210                     ; vpandn        %xmm2,%xmm3,%xmm2
-  DB  196,226,121,51,218                  ; vpmovzxwd     %xmm2,%xmm3
-  DB  196,193,105,105,210                 ; vpunpckhwd    %xmm10,%xmm2,%xmm2
-  DB  197,225,114,243,13                  ; vpslld        $0xd,%xmm3,%xmm3
+  DB  197,233,254,211                     ; vpaddd        %xmm3,%xmm2,%xmm2
+  DB  197,233,254,201                     ; vpaddd        %xmm1,%xmm2,%xmm1
+  DB  196,227,117,24,206,1                ; vinsertf128   $0x1,%xmm6,%ymm1,%ymm1
+  DB  196,195,117,74,205,64               ; vblendvps     %ymm4,%ymm13,%ymm1,%ymm1
+  DB  196,193,57,108,210                  ; vpunpcklqdq   %xmm10,%xmm8,%xmm2
+  DB  196,193,105,105,228                 ; vpunpckhwd    %xmm12,%xmm2,%xmm4
+  DB  196,226,121,51,210                  ; vpmovzxwd     %xmm2,%xmm2
+  DB  196,227,109,24,212,1                ; vinsertf128   $0x1,%xmm4,%ymm2,%ymm2
+  DB  196,193,108,84,227                  ; vandps        %ymm11,%ymm2,%ymm4
+  DB  196,227,125,25,230,1                ; vextractf128  $0x1,%ymm4,%xmm6
+  DB  196,193,73,118,244                  ; vpcmpeqd      %xmm12,%xmm6,%xmm6
+  DB  196,193,89,118,228                  ; vpcmpeqd      %xmm12,%xmm4,%xmm4
+  DB  196,227,93,24,230,1                 ; vinsertf128   $0x1,%xmm6,%ymm4,%ymm4
+  DB  196,193,108,84,241                  ; vandps        %ymm9,%ymm2,%ymm6
+  DB  197,236,87,214                      ; vxorps        %ymm6,%ymm2,%ymm2
+  DB  196,227,125,25,247,1                ; vextractf128  $0x1,%ymm6,%xmm7
+  DB  197,193,114,247,16                  ; vpslld        $0x10,%xmm7,%xmm7
+  DB  196,227,125,25,213,1                ; vextractf128  $0x1,%ymm2,%xmm5
+  DB  197,209,114,245,13                  ; vpslld        $0xd,%xmm5,%xmm5
+  DB  197,193,254,251                     ; vpaddd        %xmm3,%xmm7,%xmm7
+  DB  197,193,254,237                     ; vpaddd        %xmm5,%xmm7,%xmm5
+  DB  197,201,114,246,16                  ; vpslld        $0x10,%xmm6,%xmm6
   DB  197,233,114,242,13                  ; vpslld        $0xd,%xmm2,%xmm2
-  DB  196,227,101,24,210,1                ; vinsertf128   $0x1,%xmm2,%ymm3,%ymm2
-  DB  197,156,89,210                      ; vmulps        %ymm2,%ymm12,%ymm2
-  DB  196,65,57,109,193                   ; vpunpckhqdq   %xmm9,%xmm8,%xmm8
-  DB  196,193,17,101,216                  ; vpcmpgtw      %xmm8,%xmm13,%xmm3
-  DB  196,193,97,223,216                  ; vpandn        %xmm8,%xmm3,%xmm3
-  DB  196,98,121,51,195                   ; vpmovzxwd     %xmm3,%xmm8
-  DB  196,193,97,105,218                  ; vpunpckhwd    %xmm10,%xmm3,%xmm3
-  DB  196,193,57,114,240,13               ; vpslld        $0xd,%xmm8,%xmm8
-  DB  197,225,114,243,13                  ; vpslld        $0xd,%xmm3,%xmm3
-  DB  196,227,61,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  DB  197,156,89,219                      ; vmulps        %ymm3,%ymm12,%ymm3
+  DB  197,201,254,243                     ; vpaddd        %xmm3,%xmm6,%xmm6
+  DB  197,201,254,210                     ; vpaddd        %xmm2,%xmm6,%xmm2
+  DB  196,227,109,24,213,1                ; vinsertf128   $0x1,%xmm5,%ymm2,%ymm2
+  DB  196,195,109,74,213,64               ; vblendvps     %ymm4,%ymm13,%ymm2,%ymm2
+  DB  196,193,57,109,226                  ; vpunpckhqdq   %xmm10,%xmm8,%xmm4
+  DB  196,193,89,105,236                  ; vpunpckhwd    %xmm12,%xmm4,%xmm5
+  DB  196,226,121,51,228                  ; vpmovzxwd     %xmm4,%xmm4
+  DB  196,227,93,24,229,1                 ; vinsertf128   $0x1,%xmm5,%ymm4,%ymm4
+  DB  196,193,92,84,235                   ; vandps        %ymm11,%ymm4,%ymm5
+  DB  196,227,125,25,238,1                ; vextractf128  $0x1,%ymm5,%xmm6
+  DB  196,193,73,118,244                  ; vpcmpeqd      %xmm12,%xmm6,%xmm6
+  DB  196,193,81,118,236                  ; vpcmpeqd      %xmm12,%xmm5,%xmm5
+  DB  196,193,92,84,249                   ; vandps        %ymm9,%ymm4,%ymm7
+  DB  197,220,87,231                      ; vxorps        %ymm7,%ymm4,%ymm4
+  DB  196,227,85,24,238,1                 ; vinsertf128   $0x1,%xmm6,%ymm5,%ymm5
+  DB  196,227,125,25,254,1                ; vextractf128  $0x1,%ymm7,%xmm6
+  DB  197,201,114,246,16                  ; vpslld        $0x10,%xmm6,%xmm6
+  DB  197,193,114,247,16                  ; vpslld        $0x10,%xmm7,%xmm7
+  DB  197,193,254,251                     ; vpaddd        %xmm3,%xmm7,%xmm7
+  DB  197,201,254,219                     ; vpaddd        %xmm3,%xmm6,%xmm3
+  DB  196,227,125,25,230,1                ; vextractf128  $0x1,%ymm4,%xmm6
+  DB  197,201,114,246,13                  ; vpslld        $0xd,%xmm6,%xmm6
+  DB  197,225,254,222                     ; vpaddd        %xmm6,%xmm3,%xmm3
+  DB  197,217,114,244,13                  ; vpslld        $0xd,%xmm4,%xmm4
+  DB  197,193,254,228                     ; vpaddd        %xmm4,%xmm7,%xmm4
+  DB  196,227,93,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm4,%ymm3
+  DB  196,195,101,74,221,80               ; vblendvps     %ymm5,%ymm13,%ymm3,%ymm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,36,36                    ; vmovups       (%rsp),%ymm4
+  DB  197,252,16,108,36,32                ; vmovups       0x20(%rsp),%ymm5
+  DB  197,252,16,116,36,64                ; vmovups       0x40(%rsp),%ymm6
+  DB  197,252,16,124,36,96                ; vmovups       0x60(%rsp),%ymm7
+  DB  72,129,196,152,0,0,0                ; add           $0x98,%rsp
   DB  91                                  ; pop           %rbx
   DB  65,92                               ; pop           %r12
   DB  65,94                               ; pop           %r14
@@ -8519,66 +8659,136 @@
 
 PUBLIC _sk_store_f16_avx
 _sk_store_f16_avx LABEL PROC
+  DB  72,129,236,216,0,0,0                ; sub           $0xd8,%rsp
+  DB  197,252,17,188,36,160,0,0,0         ; vmovups       %ymm7,0xa0(%rsp)
+  DB  197,252,17,180,36,128,0,0,0         ; vmovups       %ymm6,0x80(%rsp)
+  DB  197,252,17,108,36,96                ; vmovups       %ymm5,0x60(%rsp)
+  DB  197,252,17,100,36,64                ; vmovups       %ymm4,0x40(%rsp)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,0                            ; mov           (%rax),%r8
-  DB  184,0,0,128,7                       ; mov           $0x7800000,%eax
+  DB  184,0,0,0,128                       ; mov           $0x80000000,%eax
   DB  197,121,110,192                     ; vmovd         %eax,%xmm8
   DB  196,65,121,112,192,0                ; vpshufd       $0x0,%xmm8,%xmm8
+  DB  196,67,61,24,200,1                  ; vinsertf128   $0x1,%xmm8,%ymm8,%ymm9
+  DB  197,52,84,208                       ; vandps        %ymm0,%ymm9,%ymm10
+  DB  197,252,17,4,36                     ; vmovups       %ymm0,(%rsp)
+  DB  196,65,124,87,218                   ; vxorps        %ymm10,%ymm0,%ymm11
+  DB  184,0,0,128,56                      ; mov           $0x38800000,%eax
+  DB  197,121,110,192                     ; vmovd         %eax,%xmm8
+  DB  196,67,121,4,192,0                  ; vpermilps     $0x0,%xmm8,%xmm8
   DB  196,67,61,24,192,1                  ; vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  196,67,125,25,202,1                 ; vextractf128  $0x1,%ymm9,%xmm10
-  DB  196,193,41,114,210,13               ; vpsrld        $0xd,%xmm10,%xmm10
-  DB  196,193,49,114,209,13               ; vpsrld        $0xd,%xmm9,%xmm9
-  DB  196,66,49,43,202                    ; vpackusdw     %xmm10,%xmm9,%xmm9
-  DB  197,60,89,209                       ; vmulps        %ymm1,%ymm8,%ymm10
-  DB  196,67,125,25,211,1                 ; vextractf128  $0x1,%ymm10,%xmm11
+  DB  196,65,36,194,224,1                 ; vcmpltps      %ymm8,%ymm11,%ymm12
+  DB  196,67,125,25,213,1                 ; vextractf128  $0x1,%ymm10,%xmm13
+  DB  196,193,17,114,213,16               ; vpsrld        $0x10,%xmm13,%xmm13
+  DB  196,193,9,114,210,16                ; vpsrld        $0x10,%xmm10,%xmm14
+  DB  196,193,1,114,211,13                ; vpsrld        $0xd,%xmm11,%xmm15
+  DB  196,67,125,25,218,1                 ; vextractf128  $0x1,%ymm11,%xmm10
+  DB  196,193,33,114,210,13               ; vpsrld        $0xd,%xmm10,%xmm11
+  DB  184,0,192,1,0                       ; mov           $0x1c000,%eax
+  DB  197,121,110,208                     ; vmovd         %eax,%xmm10
+  DB  196,65,121,112,210,0                ; vpshufd       $0x0,%xmm10,%xmm10
+  DB  196,65,9,250,242                    ; vpsubd        %xmm10,%xmm14,%xmm14
+  DB  196,65,17,250,234                   ; vpsubd        %xmm10,%xmm13,%xmm13
+  DB  196,65,17,254,219                   ; vpaddd        %xmm11,%xmm13,%xmm11
+  DB  196,65,9,254,239                    ; vpaddd        %xmm15,%xmm14,%xmm13
+  DB  196,67,21,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm13,%ymm13
+  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
+  DB  196,99,21,74,224,192                ; vblendvps     %ymm12,%ymm0,%ymm13,%ymm12
+  DB  197,52,84,233                       ; vandps        %ymm1,%ymm9,%ymm13
+  DB  197,252,17,76,36,32                 ; vmovups       %ymm1,0x20(%rsp)
+  DB  196,65,116,87,245                   ; vxorps        %ymm13,%ymm1,%ymm14
+  DB  196,67,125,25,239,1                 ; vextractf128  $0x1,%ymm13,%xmm15
+  DB  196,193,1,114,215,16                ; vpsrld        $0x10,%xmm15,%xmm15
+  DB  196,67,125,25,243,1                 ; vextractf128  $0x1,%ymm14,%xmm11
   DB  196,193,33,114,211,13               ; vpsrld        $0xd,%xmm11,%xmm11
-  DB  196,193,41,114,210,13               ; vpsrld        $0xd,%xmm10,%xmm10
-  DB  196,66,41,43,211                    ; vpackusdw     %xmm11,%xmm10,%xmm10
-  DB  197,60,89,218                       ; vmulps        %ymm2,%ymm8,%ymm11
-  DB  196,67,125,25,220,1                 ; vextractf128  $0x1,%ymm11,%xmm12
-  DB  196,193,25,114,212,13               ; vpsrld        $0xd,%xmm12,%xmm12
-  DB  196,193,33,114,211,13               ; vpsrld        $0xd,%xmm11,%xmm11
-  DB  196,66,33,43,220                    ; vpackusdw     %xmm12,%xmm11,%xmm11
-  DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,67,125,25,196,1                 ; vextractf128  $0x1,%ymm8,%xmm12
-  DB  196,193,25,114,212,13               ; vpsrld        $0xd,%xmm12,%xmm12
-  DB  196,193,57,114,208,13               ; vpsrld        $0xd,%xmm8,%xmm8
-  DB  196,66,57,43,196                    ; vpackusdw     %xmm12,%xmm8,%xmm8
-  DB  196,65,49,97,226                    ; vpunpcklwd    %xmm10,%xmm9,%xmm12
-  DB  196,65,49,105,234                   ; vpunpckhwd    %xmm10,%xmm9,%xmm13
-  DB  196,65,33,97,200                    ; vpunpcklwd    %xmm8,%xmm11,%xmm9
-  DB  196,65,33,105,192                   ; vpunpckhwd    %xmm8,%xmm11,%xmm8
-  DB  196,65,25,98,217                    ; vpunpckldq    %xmm9,%xmm12,%xmm11
-  DB  196,65,25,106,209                   ; vpunpckhdq    %xmm9,%xmm12,%xmm10
-  DB  196,65,17,98,200                    ; vpunpckldq    %xmm8,%xmm13,%xmm9
-  DB  196,65,17,106,192                   ; vpunpckhdq    %xmm8,%xmm13,%xmm8
+  DB  196,193,1,250,250                   ; vpsubd        %xmm10,%xmm15,%xmm7
+  DB  196,193,65,254,251                  ; vpaddd        %xmm11,%xmm7,%xmm7
+  DB  196,193,73,114,213,16               ; vpsrld        $0x10,%xmm13,%xmm6
+  DB  196,193,73,250,242                  ; vpsubd        %xmm10,%xmm6,%xmm6
+  DB  196,193,81,114,214,13               ; vpsrld        $0xd,%xmm14,%xmm5
+  DB  197,201,254,237                     ; vpaddd        %xmm5,%xmm6,%xmm5
+  DB  196,193,12,194,240,1                ; vcmpltps      %ymm8,%ymm14,%ymm6
+  DB  196,227,85,24,239,1                 ; vinsertf128   $0x1,%xmm7,%ymm5,%ymm5
+  DB  196,99,85,74,232,96                 ; vblendvps     %ymm6,%ymm0,%ymm5,%ymm13
+  DB  197,180,84,234                      ; vandps        %ymm2,%ymm9,%ymm5
+  DB  196,227,125,25,238,1                ; vextractf128  $0x1,%ymm5,%xmm6
+  DB  197,201,114,214,16                  ; vpsrld        $0x10,%xmm6,%xmm6
+  DB  197,236,87,253                      ; vxorps        %ymm5,%ymm2,%ymm7
+  DB  196,227,125,25,252,1                ; vextractf128  $0x1,%ymm7,%xmm4
+  DB  197,217,114,212,13                  ; vpsrld        $0xd,%xmm4,%xmm4
+  DB  196,193,73,250,242                  ; vpsubd        %xmm10,%xmm6,%xmm6
+  DB  197,201,254,228                     ; vpaddd        %xmm4,%xmm6,%xmm4
+  DB  197,209,114,213,16                  ; vpsrld        $0x10,%xmm5,%xmm5
+  DB  196,193,81,250,234                  ; vpsubd        %xmm10,%xmm5,%xmm5
+  DB  197,201,114,215,13                  ; vpsrld        $0xd,%xmm7,%xmm6
+  DB  197,209,254,238                     ; vpaddd        %xmm6,%xmm5,%xmm5
+  DB  196,227,85,24,228,1                 ; vinsertf128   $0x1,%xmm4,%ymm5,%ymm4
+  DB  196,193,68,194,232,1                ; vcmpltps      %ymm8,%ymm7,%ymm5
+  DB  196,227,93,74,224,80                ; vblendvps     %ymm5,%ymm0,%ymm4,%ymm4
+  DB  197,180,84,235                      ; vandps        %ymm3,%ymm9,%ymm5
+  DB  196,227,125,25,238,1                ; vextractf128  $0x1,%ymm5,%xmm6
+  DB  197,201,114,214,16                  ; vpsrld        $0x10,%xmm6,%xmm6
+  DB  197,193,114,213,16                  ; vpsrld        $0x10,%xmm5,%xmm7
+  DB  196,193,65,250,250                  ; vpsubd        %xmm10,%xmm7,%xmm7
+  DB  196,193,73,250,242                  ; vpsubd        %xmm10,%xmm6,%xmm6
+  DB  197,228,87,237                      ; vxorps        %ymm5,%ymm3,%ymm5
+  DB  196,227,125,25,233,1                ; vextractf128  $0x1,%ymm5,%xmm1
+  DB  197,241,114,209,13                  ; vpsrld        $0xd,%xmm1,%xmm1
+  DB  197,201,254,201                     ; vpaddd        %xmm1,%xmm6,%xmm1
+  DB  196,193,84,194,240,1                ; vcmpltps      %ymm8,%ymm5,%ymm6
+  DB  197,209,114,213,13                  ; vpsrld        $0xd,%xmm5,%xmm5
+  DB  197,193,254,237                     ; vpaddd        %xmm5,%xmm7,%xmm5
+  DB  196,227,85,24,201,1                 ; vinsertf128   $0x1,%xmm1,%ymm5,%ymm1
+  DB  196,227,117,74,192,96               ; vblendvps     %ymm6,%ymm0,%ymm1,%ymm0
+  DB  196,99,125,25,225,1                 ; vextractf128  $0x1,%ymm12,%xmm1
+  DB  196,226,25,43,201                   ; vpackusdw     %xmm1,%xmm12,%xmm1
+  DB  196,99,125,25,237,1                 ; vextractf128  $0x1,%ymm13,%xmm5
+  DB  196,226,17,43,237                   ; vpackusdw     %xmm5,%xmm13,%xmm5
+  DB  196,227,125,25,230,1                ; vextractf128  $0x1,%ymm4,%xmm6
+  DB  196,226,89,43,230                   ; vpackusdw     %xmm6,%xmm4,%xmm4
+  DB  196,227,125,25,198,1                ; vextractf128  $0x1,%ymm0,%xmm6
+  DB  196,226,121,43,198                  ; vpackusdw     %xmm6,%xmm0,%xmm0
+  DB  197,241,97,245                      ; vpunpcklwd    %xmm5,%xmm1,%xmm6
+  DB  197,241,105,205                     ; vpunpckhwd    %xmm5,%xmm1,%xmm1
+  DB  197,217,97,232                      ; vpunpcklwd    %xmm0,%xmm4,%xmm5
+  DB  197,217,105,192                     ; vpunpckhwd    %xmm0,%xmm4,%xmm0
+  DB  197,73,98,221                       ; vpunpckldq    %xmm5,%xmm6,%xmm11
+  DB  197,73,106,213                      ; vpunpckhdq    %xmm5,%xmm6,%xmm10
+  DB  197,113,98,200                      ; vpunpckldq    %xmm0,%xmm1,%xmm9
+  DB  197,113,106,192                     ; vpunpckhdq    %xmm0,%xmm1,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,31                              ; jne           4704 <_sk_store_f16_avx+0xd2>
+  DB  117,79                              ; jne           4b69 <_sk_store_f16_avx+0x24f>
   DB  196,65,120,17,28,248                ; vmovups       %xmm11,(%r8,%rdi,8)
   DB  196,65,120,17,84,248,16             ; vmovups       %xmm10,0x10(%r8,%rdi,8)
   DB  196,65,120,17,76,248,32             ; vmovups       %xmm9,0x20(%r8,%rdi,8)
   DB  196,65,122,127,68,248,48            ; vmovdqu       %xmm8,0x30(%r8,%rdi,8)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,4,36                     ; vmovups       (%rsp),%ymm0
+  DB  197,252,16,76,36,32                 ; vmovups       0x20(%rsp),%ymm1
+  DB  197,252,16,100,36,64                ; vmovups       0x40(%rsp),%ymm4
+  DB  197,252,16,108,36,96                ; vmovups       0x60(%rsp),%ymm5
+  DB  197,252,16,180,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm6
+  DB  197,252,16,188,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm7
+  DB  72,129,196,216,0,0,0                ; add           $0xd8,%rsp
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,214,28,248               ; vmovq         %xmm11,(%r8,%rdi,8)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            4700 <_sk_store_f16_avx+0xce>
+  DB  116,192                             ; je            4b35 <_sk_store_f16_avx+0x21b>
   DB  196,65,121,23,92,248,8              ; vmovhpd       %xmm11,0x8(%r8,%rdi,8)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            4700 <_sk_store_f16_avx+0xce>
+  DB  114,179                             ; jb            4b35 <_sk_store_f16_avx+0x21b>
   DB  196,65,121,214,84,248,16            ; vmovq         %xmm10,0x10(%r8,%rdi,8)
-  DB  116,218                             ; je            4700 <_sk_store_f16_avx+0xce>
+  DB  116,170                             ; je            4b35 <_sk_store_f16_avx+0x21b>
   DB  196,65,121,23,84,248,24             ; vmovhpd       %xmm10,0x18(%r8,%rdi,8)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            4700 <_sk_store_f16_avx+0xce>
+  DB  114,157                             ; jb            4b35 <_sk_store_f16_avx+0x21b>
   DB  196,65,121,214,76,248,32            ; vmovq         %xmm9,0x20(%r8,%rdi,8)
-  DB  116,196                             ; je            4700 <_sk_store_f16_avx+0xce>
+  DB  116,148                             ; je            4b35 <_sk_store_f16_avx+0x21b>
   DB  196,65,121,23,76,248,40             ; vmovhpd       %xmm9,0x28(%r8,%rdi,8)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,183                             ; jb            4700 <_sk_store_f16_avx+0xce>
+  DB  114,135                             ; jb            4b35 <_sk_store_f16_avx+0x21b>
   DB  196,65,121,214,68,248,48            ; vmovq         %xmm8,0x30(%r8,%rdi,8)
-  DB  235,174                             ; jmp           4700 <_sk_store_f16_avx+0xce>
+  DB  233,123,255,255,255                 ; jmpq          4b35 <_sk_store_f16_avx+0x21b>
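
The store path above is the inverse trick: 0x80000000 isolates the float
sign, anything below 0x38800000 (2^-14, the smallest normal half) is
flushed to zero via the vcmpltps/vblendvps pair, and the 16- and 13-bit
right shifts with the 0x1c000 rebias assemble the half. A scalar sketch
under the same naming caveat as before:

    #include <cstdint>
    #include <cstring>

    static uint16_t float_to_half(float f) {
        uint32_t sem;
        std::memcpy(&sem, &f, sizeof sem);
        uint32_t s  = sem & 0x80000000,   // sign bit
                 em = sem ^ s;            // exponent + mantissa

        // Anything smaller in magnitude than the smallest normal half
        // (2^-14, float bits 0x38800000) flushes to zero, sign and all.
        if (em < 0x38800000) { return 0; }

        // Truncate 13 mantissa bits (no rounding) and rebias the
        // exponent: (127-15)<<10 == 0x1c000.
        return (uint16_t)((s >> 16) + (em >> 13) - ((127 - 15) << 10));
    }

This truncates rather than rounds and does not special-case overflow,
Inf, or NaN, the same trade-offs visible in the vector code above.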
 
 PUBLIC _sk_load_u16_be_avx
 _sk_load_u16_be_avx LABEL PROC
@@ -8586,7 +8796,7 @@
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,5,1,0,0                      ; jne           486d <_sk_load_u16_be_avx+0x11b>
+  DB  15,133,5,1,0,0                      ; jne           4cd5 <_sk_load_u16_be_avx+0x11b>
   DB  196,65,121,16,4,64                  ; vmovupd       (%r8,%rax,2),%xmm8
   DB  196,193,121,16,84,64,16             ; vmovupd       0x10(%r8,%rax,2),%xmm2
   DB  196,193,121,16,92,64,32             ; vmovupd       0x20(%r8,%rax,2),%xmm3
@@ -8645,29 +8855,29 @@
   DB  196,65,123,16,4,64                  ; vmovsd        (%r8,%rax,2),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,85                              ; je            48d3 <_sk_load_u16_be_avx+0x181>
+  DB  116,85                              ; je            4d3b <_sk_load_u16_be_avx+0x181>
   DB  196,65,57,22,68,64,8                ; vmovhpd       0x8(%r8,%rax,2),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,72                              ; jb            48d3 <_sk_load_u16_be_avx+0x181>
+  DB  114,72                              ; jb            4d3b <_sk_load_u16_be_avx+0x181>
   DB  196,193,123,16,84,64,16             ; vmovsd        0x10(%r8,%rax,2),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,72                              ; je            48e0 <_sk_load_u16_be_avx+0x18e>
+  DB  116,72                              ; je            4d48 <_sk_load_u16_be_avx+0x18e>
   DB  196,193,105,22,84,64,24             ; vmovhpd       0x18(%r8,%rax,2),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,59                              ; jb            48e0 <_sk_load_u16_be_avx+0x18e>
+  DB  114,59                              ; jb            4d48 <_sk_load_u16_be_avx+0x18e>
   DB  196,193,123,16,92,64,32             ; vmovsd        0x20(%r8,%rax,2),%xmm3
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,205,254,255,255              ; je            4783 <_sk_load_u16_be_avx+0x31>
+  DB  15,132,205,254,255,255              ; je            4beb <_sk_load_u16_be_avx+0x31>
   DB  196,193,97,22,92,64,40              ; vmovhpd       0x28(%r8,%rax,2),%xmm3,%xmm3
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,188,254,255,255              ; jb            4783 <_sk_load_u16_be_avx+0x31>
+  DB  15,130,188,254,255,255              ; jb            4beb <_sk_load_u16_be_avx+0x31>
   DB  196,65,122,126,76,64,48             ; vmovq         0x30(%r8,%rax,2),%xmm9
-  DB  233,176,254,255,255                 ; jmpq          4783 <_sk_load_u16_be_avx+0x31>
+  DB  233,176,254,255,255                 ; jmpq          4beb <_sk_load_u16_be_avx+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,163,254,255,255                 ; jmpq          4783 <_sk_load_u16_be_avx+0x31>
+  DB  233,163,254,255,255                 ; jmpq          4beb <_sk_load_u16_be_avx+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,154,254,255,255                 ; jmpq          4783 <_sk_load_u16_be_avx+0x31>
+  DB  233,154,254,255,255                 ; jmpq          4beb <_sk_load_u16_be_avx+0x31>
 
 PUBLIC _sk_load_rgb_u16_be_avx
 _sk_load_rgb_u16_be_avx LABEL PROC
@@ -8675,7 +8885,7 @@
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,141,4,127                        ; lea           (%rdi,%rdi,2),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,8,1,0,0                      ; jne           4a03 <_sk_load_rgb_u16_be_avx+0x11a>
+  DB  15,133,8,1,0,0                      ; jne           4e6b <_sk_load_rgb_u16_be_avx+0x11a>
   DB  196,193,122,111,4,64                ; vmovdqu       (%r8,%rax,2),%xmm0
   DB  196,193,122,111,84,64,12            ; vmovdqu       0xc(%r8,%rax,2),%xmm2
   DB  196,193,122,111,76,64,24            ; vmovdqu       0x18(%r8,%rax,2),%xmm1
@@ -8734,36 +8944,36 @@
   DB  196,193,121,110,4,64                ; vmovd         (%r8,%rax,2),%xmm0
   DB  196,193,121,196,68,64,4,2           ; vpinsrw       $0x2,0x4(%r8,%rax,2),%xmm0,%xmm0
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,5                               ; jne           4a1c <_sk_load_rgb_u16_be_avx+0x133>
-  DB  233,19,255,255,255                  ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
+  DB  117,5                               ; jne           4e84 <_sk_load_rgb_u16_be_avx+0x133>
+  DB  233,19,255,255,255                  ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
   DB  196,193,121,110,76,64,6             ; vmovd         0x6(%r8,%rax,2),%xmm1
   DB  196,65,113,196,68,64,10,2           ; vpinsrw       $0x2,0xa(%r8,%rax,2),%xmm1,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,26                              ; jb            4a4b <_sk_load_rgb_u16_be_avx+0x162>
+  DB  114,26                              ; jb            4eb3 <_sk_load_rgb_u16_be_avx+0x162>
   DB  196,193,121,110,76,64,12            ; vmovd         0xc(%r8,%rax,2),%xmm1
   DB  196,193,113,196,84,64,16,2          ; vpinsrw       $0x2,0x10(%r8,%rax,2),%xmm1,%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,10                              ; jne           4a50 <_sk_load_rgb_u16_be_avx+0x167>
-  DB  233,228,254,255,255                 ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
-  DB  233,223,254,255,255                 ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
+  DB  117,10                              ; jne           4eb8 <_sk_load_rgb_u16_be_avx+0x167>
+  DB  233,228,254,255,255                 ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,223,254,255,255                 ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
   DB  196,193,121,110,76,64,18            ; vmovd         0x12(%r8,%rax,2),%xmm1
   DB  196,65,113,196,76,64,22,2           ; vpinsrw       $0x2,0x16(%r8,%rax,2),%xmm1,%xmm9
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,26                              ; jb            4a7f <_sk_load_rgb_u16_be_avx+0x196>
+  DB  114,26                              ; jb            4ee7 <_sk_load_rgb_u16_be_avx+0x196>
   DB  196,193,121,110,76,64,24            ; vmovd         0x18(%r8,%rax,2),%xmm1
   DB  196,193,113,196,76,64,28,2          ; vpinsrw       $0x2,0x1c(%r8,%rax,2),%xmm1,%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,10                              ; jne           4a84 <_sk_load_rgb_u16_be_avx+0x19b>
-  DB  233,176,254,255,255                 ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
-  DB  233,171,254,255,255                 ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
+  DB  117,10                              ; jne           4eec <_sk_load_rgb_u16_be_avx+0x19b>
+  DB  233,176,254,255,255                 ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,171,254,255,255                 ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
   DB  196,193,121,110,92,64,30            ; vmovd         0x1e(%r8,%rax,2),%xmm3
   DB  196,65,97,196,92,64,34,2            ; vpinsrw       $0x2,0x22(%r8,%rax,2),%xmm3,%xmm11
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,20                              ; jb            4aad <_sk_load_rgb_u16_be_avx+0x1c4>
+  DB  114,20                              ; jb            4f15 <_sk_load_rgb_u16_be_avx+0x1c4>
   DB  196,193,121,110,92,64,36            ; vmovd         0x24(%r8,%rax,2),%xmm3
   DB  196,193,97,196,92,64,40,2           ; vpinsrw       $0x2,0x28(%r8,%rax,2),%xmm3,%xmm3
-  DB  233,130,254,255,255                 ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
-  DB  233,125,254,255,255                 ; jmpq          492f <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,130,254,255,255                 ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,125,254,255,255                 ; jmpq          4d97 <_sk_load_rgb_u16_be_avx+0x46>
 
 PUBLIC _sk_store_u16_be_avx
 _sk_store_u16_be_avx LABEL PROC
@@ -8811,7 +9021,7 @@
   DB  196,65,17,98,200                    ; vpunpckldq    %xmm8,%xmm13,%xmm9
   DB  196,65,17,106,192                   ; vpunpckhdq    %xmm8,%xmm13,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,31                              ; jne           4bb4 <_sk_store_u16_be_avx+0x102>
+  DB  117,31                              ; jne           501c <_sk_store_u16_be_avx+0x102>
   DB  196,1,120,17,28,72                  ; vmovups       %xmm11,(%r8,%r9,2)
   DB  196,1,120,17,84,72,16               ; vmovups       %xmm10,0x10(%r8,%r9,2)
   DB  196,1,120,17,76,72,32               ; vmovups       %xmm9,0x20(%r8,%r9,2)
@@ -8820,31 +9030,31 @@
   DB  255,224                             ; jmpq          *%rax
   DB  196,1,121,214,28,72                 ; vmovq         %xmm11,(%r8,%r9,2)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  116,240                             ; je            5018 <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,23,92,72,8                ; vmovhpd       %xmm11,0x8(%r8,%r9,2)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  114,227                             ; jb            5018 <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,214,84,72,16              ; vmovq         %xmm10,0x10(%r8,%r9,2)
-  DB  116,218                             ; je            4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  116,218                             ; je            5018 <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,23,84,72,24               ; vmovhpd       %xmm10,0x18(%r8,%r9,2)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  114,205                             ; jb            5018 <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,214,76,72,32              ; vmovq         %xmm9,0x20(%r8,%r9,2)
-  DB  116,196                             ; je            4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  116,196                             ; je            5018 <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,23,76,72,40               ; vmovhpd       %xmm9,0x28(%r8,%r9,2)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,183                             ; jb            4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  114,183                             ; jb            5018 <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,214,68,72,48              ; vmovq         %xmm8,0x30(%r8,%r9,2)
-  DB  235,174                             ; jmp           4bb0 <_sk_store_u16_be_avx+0xfe>
+  DB  235,174                             ; jmp           5018 <_sk_store_u16_be_avx+0xfe>
 
 PUBLIC _sk_load_f32_avx
 _sk_load_f32_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  119,110                             ; ja            4c78 <_sk_load_f32_avx+0x76>
+  DB  119,110                             ; ja            50e0 <_sk_load_f32_avx+0x76>
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
-  DB  76,141,21,132,0,0,0                 ; lea           0x84(%rip),%r10        # 4ca0 <_sk_load_f32_avx+0x9e>
+  DB  76,141,21,132,0,0,0                 ; lea           0x84(%rip),%r10        # 5108 <_sk_load_f32_avx+0x9e>
   DB  73,99,4,138                         ; movslq        (%r10,%rcx,4),%rax
   DB  76,1,208                            ; add           %r10,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8901,7 +9111,7 @@
   DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
   DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           4d2d <_sk_store_f32_avx+0x6d>
+  DB  117,55                              ; jne           5195 <_sk_store_f32_avx+0x6d>
   DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -8914,22 +9124,22 @@
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            4d29 <_sk_store_f32_avx+0x69>
+  DB  116,240                             ; je            5191 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            4d29 <_sk_store_f32_avx+0x69>
+  DB  114,227                             ; jb            5191 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            4d29 <_sk_store_f32_avx+0x69>
+  DB  116,218                             ; je            5191 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            4d29 <_sk_store_f32_avx+0x69>
+  DB  114,205                             ; jb            5191 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            4d29 <_sk_store_f32_avx+0x69>
+  DB  116,195                             ; je            5191 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            4d29 <_sk_store_f32_avx+0x69>
+  DB  114,181                             ; jb            5191 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           4d29 <_sk_store_f32_avx+0x69>
+  DB  235,171                             ; jmp           5191 <_sk_store_f32_avx+0x69>
 
 PUBLIC _sk_clamp_x_avx
 _sk_clamp_x_avx LABEL PROC
@@ -9233,7 +9443,7 @@
   DB  196,226,125,24,88,28                ; vbroadcastss  0x1c(%rax),%ymm3
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  15,132,146,0,0,0                    ; je            52e1 <_sk_linear_gradient_avx+0xb8>
+  DB  15,132,146,0,0,0                    ; je            5749 <_sk_linear_gradient_avx+0xb8>
   DB  72,139,64,8                         ; mov           0x8(%rax),%rax
   DB  72,131,192,32                       ; add           $0x20,%rax
   DB  196,65,28,87,228                    ; vxorps        %ymm12,%ymm12,%ymm12
@@ -9260,8 +9470,8 @@
   DB  196,227,13,74,219,208               ; vblendvps     %ymm13,%ymm3,%ymm14,%ymm3
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  73,255,200                          ; dec           %r8
-  DB  117,140                             ; jne           526b <_sk_linear_gradient_avx+0x42>
-  DB  235,20                              ; jmp           52f5 <_sk_linear_gradient_avx+0xcc>
+  DB  117,140                             ; jne           56d3 <_sk_linear_gradient_avx+0x42>
+  DB  235,20                              ; jmp           575d <_sk_linear_gradient_avx+0xcc>
   DB  196,65,36,87,219                    ; vxorps        %ymm11,%ymm11,%ymm11
   DB  196,65,44,87,210                    ; vxorps        %ymm10,%ymm10,%ymm10
   DB  196,65,52,87,201                    ; vxorps        %ymm9,%ymm9,%ymm9
@@ -12754,43 +12964,70 @@
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  243,15,111,4,248                    ; movdqu        (%rax,%rdi,8),%xmm0
   DB  243,15,111,76,248,16                ; movdqu        0x10(%rax,%rdi,8),%xmm1
-  DB  102,68,15,111,192                   ; movdqa        %xmm0,%xmm8
-  DB  102,68,15,97,193                    ; punpcklwd     %xmm1,%xmm8
+  DB  102,68,15,111,200                   ; movdqa        %xmm0,%xmm9
+  DB  102,68,15,97,201                    ; punpcklwd     %xmm1,%xmm9
   DB  102,15,105,193                      ; punpckhwd     %xmm1,%xmm0
-  DB  102,65,15,111,200                   ; movdqa        %xmm8,%xmm1
-  DB  102,15,97,200                       ; punpcklwd     %xmm0,%xmm1
-  DB  102,68,15,105,192                   ; punpckhwd     %xmm0,%xmm8
-  DB  184,0,4,0,4                         ; mov           $0x4000400,%eax
+  DB  102,69,15,111,225                   ; movdqa        %xmm9,%xmm12
+  DB  102,68,15,97,224                    ; punpcklwd     %xmm0,%xmm12
+  DB  102,68,15,105,200                   ; punpckhwd     %xmm0,%xmm9
+  DB  102,69,15,56,51,236                 ; pmovzxwd      %xmm12,%xmm13
+  DB  184,0,128,0,0                       ; mov           $0x8000,%eax
+  DB  102,15,110,192                      ; movd          %eax,%xmm0
+  DB  102,68,15,112,192,0                 ; pshufd        $0x0,%xmm0,%xmm8
+  DB  102,65,15,111,213                   ; movdqa        %xmm13,%xmm2
+  DB  102,65,15,219,208                   ; pand          %xmm8,%xmm2
+  DB  184,0,124,0,0                       ; mov           $0x7c00,%eax
   DB  102,15,110,192                      ; movd          %eax,%xmm0
   DB  102,15,112,216,0                    ; pshufd        $0x0,%xmm0,%xmm3
-  DB  102,15,111,195                      ; movdqa        %xmm3,%xmm0
-  DB  102,15,101,193                      ; pcmpgtw       %xmm1,%xmm0
-  DB  102,15,223,193                      ; pandn         %xmm1,%xmm0
-  DB  102,15,56,51,192                    ; pmovzxwd      %xmm0,%xmm0
-  DB  102,15,114,240,13                   ; pslld         $0xd,%xmm0
-  DB  184,0,0,128,119                     ; mov           $0x77800000,%eax
-  DB  102,15,110,208                      ; movd          %eax,%xmm2
-  DB  102,68,15,112,202,0                 ; pshufd        $0x0,%xmm2,%xmm9
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  102,15,112,201,78                   ; pshufd        $0x4e,%xmm1,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,15,101,209                      ; pcmpgtw       %xmm1,%xmm2
-  DB  102,15,223,209                      ; pandn         %xmm1,%xmm2
-  DB  102,15,56,51,202                    ; pmovzxwd      %xmm2,%xmm1
-  DB  102,15,114,241,13                   ; pslld         $0xd,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,65,15,101,208                   ; pcmpgtw       %xmm8,%xmm2
-  DB  102,65,15,223,208                   ; pandn         %xmm8,%xmm2
-  DB  102,15,56,51,210                    ; pmovzxwd      %xmm2,%xmm2
-  DB  102,15,114,242,13                   ; pslld         $0xd,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  102,69,15,112,192,78                ; pshufd        $0x4e,%xmm8,%xmm8
-  DB  102,65,15,101,216                   ; pcmpgtw       %xmm8,%xmm3
+  DB  102,65,15,111,197                   ; movdqa        %xmm13,%xmm0
+  DB  102,15,219,195                      ; pand          %xmm3,%xmm0
+  DB  102,68,15,239,234                   ; pxor          %xmm2,%xmm13
+  DB  102,69,15,239,210                   ; pxor          %xmm10,%xmm10
+  DB  102,15,114,242,16                   ; pslld         $0x10,%xmm2
+  DB  102,65,15,114,245,13                ; pslld         $0xd,%xmm13
+  DB  184,0,0,0,56                        ; mov           $0x38000000,%eax
+  DB  102,15,110,200                      ; movd          %eax,%xmm1
+  DB  102,68,15,112,217,0                 ; pshufd        $0x0,%xmm1,%xmm11
+  DB  102,65,15,254,211                   ; paddd         %xmm11,%xmm2
+  DB  102,65,15,254,213                   ; paddd         %xmm13,%xmm2
+  DB  102,65,15,118,194                   ; pcmpeqd       %xmm10,%xmm0
+  DB  102,15,223,194                      ; pandn         %xmm2,%xmm0
+  DB  102,65,15,115,220,8                 ; psrldq        $0x8,%xmm12
+  DB  102,69,15,56,51,228                 ; pmovzxwd      %xmm12,%xmm12
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,65,15,219,208                   ; pand          %xmm8,%xmm2
+  DB  102,65,15,111,204                   ; movdqa        %xmm12,%xmm1
+  DB  102,15,219,203                      ; pand          %xmm3,%xmm1
+  DB  102,68,15,239,226                   ; pxor          %xmm2,%xmm12
+  DB  102,15,114,242,16                   ; pslld         $0x10,%xmm2
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,65,15,254,211                   ; paddd         %xmm11,%xmm2
+  DB  102,65,15,254,212                   ; paddd         %xmm12,%xmm2
+  DB  102,65,15,118,202                   ; pcmpeqd       %xmm10,%xmm1
+  DB  102,15,223,202                      ; pandn         %xmm2,%xmm1
+  DB  102,69,15,56,51,225                 ; pmovzxwd      %xmm9,%xmm12
+  DB  102,69,15,111,236                   ; movdqa        %xmm12,%xmm13
+  DB  102,69,15,219,232                   ; pand          %xmm8,%xmm13
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,15,219,211                      ; pand          %xmm3,%xmm2
+  DB  102,69,15,239,229                   ; pxor          %xmm13,%xmm12
+  DB  102,65,15,114,245,16                ; pslld         $0x10,%xmm13
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,69,15,254,235                   ; paddd         %xmm11,%xmm13
+  DB  102,69,15,254,236                   ; paddd         %xmm12,%xmm13
+  DB  102,65,15,118,210                   ; pcmpeqd       %xmm10,%xmm2
+  DB  102,65,15,223,213                   ; pandn         %xmm13,%xmm2
+  DB  102,65,15,115,217,8                 ; psrldq        $0x8,%xmm9
+  DB  102,69,15,56,51,201                 ; pmovzxwd      %xmm9,%xmm9
+  DB  102,69,15,219,193                   ; pand          %xmm9,%xmm8
+  DB  102,65,15,219,217                   ; pand          %xmm9,%xmm3
+  DB  102,69,15,239,200                   ; pxor          %xmm8,%xmm9
+  DB  102,65,15,114,240,16                ; pslld         $0x10,%xmm8
+  DB  102,65,15,114,241,13                ; pslld         $0xd,%xmm9
+  DB  102,69,15,254,195                   ; paddd         %xmm11,%xmm8
+  DB  102,69,15,254,193                   ; paddd         %xmm9,%xmm8
+  DB  102,65,15,118,218                   ; pcmpeqd       %xmm10,%xmm3
   DB  102,65,15,223,216                   ; pandn         %xmm8,%xmm3
-  DB  102,15,56,51,219                    ; pmovzxwd      %xmm3,%xmm3
-  DB  102,15,114,243,13                   ; pslld         $0xd,%xmm3
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
@@ -12816,43 +13053,70 @@
   DB  243,65,15,126,4,193                 ; movq          (%r9,%rax,8),%xmm0
   DB  243,67,15,126,20,193                ; movq          (%r9,%r8,8),%xmm2
   DB  102,15,108,208                      ; punpcklqdq    %xmm0,%xmm2
-  DB  102,68,15,111,194                   ; movdqa        %xmm2,%xmm8
-  DB  102,68,15,97,193                    ; punpcklwd     %xmm1,%xmm8
+  DB  102,68,15,111,202                   ; movdqa        %xmm2,%xmm9
+  DB  102,68,15,97,201                    ; punpcklwd     %xmm1,%xmm9
   DB  102,15,105,209                      ; punpckhwd     %xmm1,%xmm2
-  DB  102,65,15,111,200                   ; movdqa        %xmm8,%xmm1
-  DB  102,15,97,202                       ; punpcklwd     %xmm2,%xmm1
-  DB  102,68,15,105,194                   ; punpckhwd     %xmm2,%xmm8
-  DB  184,0,4,0,4                         ; mov           $0x4000400,%eax
+  DB  102,69,15,111,225                   ; movdqa        %xmm9,%xmm12
+  DB  102,68,15,97,226                    ; punpcklwd     %xmm2,%xmm12
+  DB  102,68,15,105,202                   ; punpckhwd     %xmm2,%xmm9
+  DB  102,69,15,56,51,236                 ; pmovzxwd      %xmm12,%xmm13
+  DB  184,0,128,0,0                       ; mov           $0x8000,%eax
+  DB  102,15,110,192                      ; movd          %eax,%xmm0
+  DB  102,68,15,112,192,0                 ; pshufd        $0x0,%xmm0,%xmm8
+  DB  102,65,15,111,213                   ; movdqa        %xmm13,%xmm2
+  DB  102,65,15,219,208                   ; pand          %xmm8,%xmm2
+  DB  184,0,124,0,0                       ; mov           $0x7c00,%eax
   DB  102,15,110,192                      ; movd          %eax,%xmm0
   DB  102,15,112,216,0                    ; pshufd        $0x0,%xmm0,%xmm3
-  DB  102,15,111,195                      ; movdqa        %xmm3,%xmm0
-  DB  102,15,101,193                      ; pcmpgtw       %xmm1,%xmm0
-  DB  102,15,223,193                      ; pandn         %xmm1,%xmm0
-  DB  102,15,56,51,192                    ; pmovzxwd      %xmm0,%xmm0
-  DB  102,15,114,240,13                   ; pslld         $0xd,%xmm0
-  DB  184,0,0,128,119                     ; mov           $0x77800000,%eax
-  DB  102,15,110,208                      ; movd          %eax,%xmm2
-  DB  102,68,15,112,202,0                 ; pshufd        $0x0,%xmm2,%xmm9
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  102,15,112,201,78                   ; pshufd        $0x4e,%xmm1,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,15,101,209                      ; pcmpgtw       %xmm1,%xmm2
-  DB  102,15,223,209                      ; pandn         %xmm1,%xmm2
-  DB  102,15,56,51,202                    ; pmovzxwd      %xmm2,%xmm1
-  DB  102,15,114,241,13                   ; pslld         $0xd,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,65,15,101,208                   ; pcmpgtw       %xmm8,%xmm2
-  DB  102,65,15,223,208                   ; pandn         %xmm8,%xmm2
-  DB  102,15,56,51,210                    ; pmovzxwd      %xmm2,%xmm2
-  DB  102,15,114,242,13                   ; pslld         $0xd,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  102,69,15,112,192,78                ; pshufd        $0x4e,%xmm8,%xmm8
-  DB  102,65,15,101,216                   ; pcmpgtw       %xmm8,%xmm3
+  DB  102,65,15,111,197                   ; movdqa        %xmm13,%xmm0
+  DB  102,15,219,195                      ; pand          %xmm3,%xmm0
+  DB  102,68,15,239,234                   ; pxor          %xmm2,%xmm13
+  DB  102,69,15,239,210                   ; pxor          %xmm10,%xmm10
+  DB  102,15,114,242,16                   ; pslld         $0x10,%xmm2
+  DB  102,65,15,114,245,13                ; pslld         $0xd,%xmm13
+  DB  184,0,0,0,56                        ; mov           $0x38000000,%eax
+  DB  102,15,110,200                      ; movd          %eax,%xmm1
+  DB  102,68,15,112,217,0                 ; pshufd        $0x0,%xmm1,%xmm11
+  DB  102,65,15,254,211                   ; paddd         %xmm11,%xmm2
+  DB  102,65,15,254,213                   ; paddd         %xmm13,%xmm2
+  DB  102,65,15,118,194                   ; pcmpeqd       %xmm10,%xmm0
+  DB  102,15,223,194                      ; pandn         %xmm2,%xmm0
+  DB  102,65,15,115,220,8                 ; psrldq        $0x8,%xmm12
+  DB  102,69,15,56,51,228                 ; pmovzxwd      %xmm12,%xmm12
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,65,15,219,208                   ; pand          %xmm8,%xmm2
+  DB  102,65,15,111,204                   ; movdqa        %xmm12,%xmm1
+  DB  102,15,219,203                      ; pand          %xmm3,%xmm1
+  DB  102,68,15,239,226                   ; pxor          %xmm2,%xmm12
+  DB  102,15,114,242,16                   ; pslld         $0x10,%xmm2
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,65,15,254,211                   ; paddd         %xmm11,%xmm2
+  DB  102,65,15,254,212                   ; paddd         %xmm12,%xmm2
+  DB  102,65,15,118,202                   ; pcmpeqd       %xmm10,%xmm1
+  DB  102,15,223,202                      ; pandn         %xmm2,%xmm1
+  DB  102,69,15,56,51,225                 ; pmovzxwd      %xmm9,%xmm12
+  DB  102,69,15,111,236                   ; movdqa        %xmm12,%xmm13
+  DB  102,69,15,219,232                   ; pand          %xmm8,%xmm13
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,15,219,211                      ; pand          %xmm3,%xmm2
+  DB  102,69,15,239,229                   ; pxor          %xmm13,%xmm12
+  DB  102,65,15,114,245,16                ; pslld         $0x10,%xmm13
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,69,15,254,235                   ; paddd         %xmm11,%xmm13
+  DB  102,69,15,254,236                   ; paddd         %xmm12,%xmm13
+  DB  102,65,15,118,210                   ; pcmpeqd       %xmm10,%xmm2
+  DB  102,65,15,223,213                   ; pandn         %xmm13,%xmm2
+  DB  102,65,15,115,217,8                 ; psrldq        $0x8,%xmm9
+  DB  102,69,15,56,51,201                 ; pmovzxwd      %xmm9,%xmm9
+  DB  102,69,15,219,193                   ; pand          %xmm9,%xmm8
+  DB  102,65,15,219,217                   ; pand          %xmm9,%xmm3
+  DB  102,69,15,239,200                   ; pxor          %xmm8,%xmm9
+  DB  102,65,15,114,240,16                ; pslld         $0x10,%xmm8
+  DB  102,65,15,114,241,13                ; pslld         $0xd,%xmm9
+  DB  102,69,15,254,195                   ; paddd         %xmm11,%xmm8
+  DB  102,69,15,254,193                   ; paddd         %xmm9,%xmm8
+  DB  102,65,15,118,218                   ; pcmpeqd       %xmm10,%xmm3
   DB  102,65,15,223,216                   ; pandn         %xmm8,%xmm3
-  DB  102,15,56,51,219                    ; pmovzxwd      %xmm3,%xmm3
-  DB  102,15,114,243,13                   ; pslld         $0xd,%xmm3
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
@@ -12860,30 +13124,68 @@
 _sk_store_f16_sse41 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,139,0                            ; mov           (%rax),%rax
-  DB  185,0,0,128,7                       ; mov           $0x7800000,%ecx
+  DB  185,0,0,0,128                       ; mov           $0x80000000,%ecx
   DB  102,68,15,110,193                   ; movd          %ecx,%xmm8
   DB  102,69,15,112,200,0                 ; pshufd        $0x0,%xmm8,%xmm9
-  DB  102,69,15,111,193                   ; movdqa        %xmm9,%xmm8
-  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
-  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
+  DB  102,69,15,111,225                   ; movdqa        %xmm9,%xmm12
+  DB  102,68,15,219,224                   ; pand          %xmm0,%xmm12
+  DB  102,68,15,111,192                   ; movdqa        %xmm0,%xmm8
+  DB  102,69,15,239,196                   ; pxor          %xmm12,%xmm8
+  DB  185,0,0,128,56                      ; mov           $0x38800000,%ecx
+  DB  102,68,15,110,209                   ; movd          %ecx,%xmm10
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
+  DB  102,65,15,114,212,16                ; psrld         $0x10,%xmm12
+  DB  102,69,15,111,232                   ; movdqa        %xmm8,%xmm13
+  DB  102,65,15,114,213,13                ; psrld         $0xd,%xmm13
+  DB  185,0,192,1,0                       ; mov           $0x1c000,%ecx
+  DB  102,68,15,110,217                   ; movd          %ecx,%xmm11
+  DB  102,69,15,112,219,0                 ; pshufd        $0x0,%xmm11,%xmm11
+  DB  102,69,15,250,227                   ; psubd         %xmm11,%xmm12
+  DB  102,69,15,254,229                   ; paddd         %xmm13,%xmm12
+  DB  69,15,194,194,5                     ; cmpnltps      %xmm10,%xmm8
+  DB  69,15,84,196                        ; andps         %xmm12,%xmm8
   DB  102,69,15,56,43,192                 ; packusdw      %xmm8,%xmm8
-  DB  102,69,15,111,209                   ; movdqa        %xmm9,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  102,65,15,114,210,13                ; psrld         $0xd,%xmm10
-  DB  102,69,15,56,43,210                 ; packusdw      %xmm10,%xmm10
-  DB  102,69,15,111,217                   ; movdqa        %xmm9,%xmm11
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  102,65,15,114,211,13                ; psrld         $0xd,%xmm11
-  DB  102,69,15,56,43,219                 ; packusdw      %xmm11,%xmm11
-  DB  68,15,89,203                        ; mulps         %xmm3,%xmm9
-  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
-  DB  102,69,15,56,43,201                 ; packusdw      %xmm9,%xmm9
-  DB  102,69,15,97,194                    ; punpcklwd     %xmm10,%xmm8
-  DB  102,69,15,97,217                    ; punpcklwd     %xmm9,%xmm11
+  DB  102,69,15,111,233                   ; movdqa        %xmm9,%xmm13
+  DB  102,68,15,219,233                   ; pand          %xmm1,%xmm13
+  DB  102,68,15,111,225                   ; movdqa        %xmm1,%xmm12
+  DB  102,69,15,239,229                   ; pxor          %xmm13,%xmm12
+  DB  102,65,15,114,213,16                ; psrld         $0x10,%xmm13
+  DB  102,69,15,111,244                   ; movdqa        %xmm12,%xmm14
+  DB  102,65,15,114,214,13                ; psrld         $0xd,%xmm14
+  DB  102,69,15,250,235                   ; psubd         %xmm11,%xmm13
+  DB  102,69,15,254,238                   ; paddd         %xmm14,%xmm13
+  DB  69,15,194,226,5                     ; cmpnltps      %xmm10,%xmm12
+  DB  69,15,84,229                        ; andps         %xmm13,%xmm12
+  DB  102,69,15,56,43,228                 ; packusdw      %xmm12,%xmm12
+  DB  102,69,15,111,241                   ; movdqa        %xmm9,%xmm14
+  DB  102,68,15,219,242                   ; pand          %xmm2,%xmm14
+  DB  102,68,15,111,234                   ; movdqa        %xmm2,%xmm13
+  DB  102,69,15,239,238                   ; pxor          %xmm14,%xmm13
+  DB  102,65,15,114,214,16                ; psrld         $0x10,%xmm14
+  DB  102,69,15,111,253                   ; movdqa        %xmm13,%xmm15
+  DB  102,65,15,114,215,13                ; psrld         $0xd,%xmm15
+  DB  102,69,15,250,243                   ; psubd         %xmm11,%xmm14
+  DB  102,69,15,254,247                   ; paddd         %xmm15,%xmm14
+  DB  69,15,194,234,5                     ; cmpnltps      %xmm10,%xmm13
+  DB  69,15,84,238                        ; andps         %xmm14,%xmm13
+  DB  102,69,15,56,43,237                 ; packusdw      %xmm13,%xmm13
+  DB  102,68,15,219,203                   ; pand          %xmm3,%xmm9
+  DB  102,68,15,111,243                   ; movdqa        %xmm3,%xmm14
+  DB  102,69,15,239,241                   ; pxor          %xmm9,%xmm14
+  DB  102,65,15,114,209,16                ; psrld         $0x10,%xmm9
+  DB  102,69,15,111,254                   ; movdqa        %xmm14,%xmm15
+  DB  102,65,15,114,215,13                ; psrld         $0xd,%xmm15
+  DB  102,69,15,250,203                   ; psubd         %xmm11,%xmm9
+  DB  102,69,15,254,207                   ; paddd         %xmm15,%xmm9
+  DB  69,15,194,242,5                     ; cmpnltps      %xmm10,%xmm14
+  DB  69,15,84,241                        ; andps         %xmm9,%xmm14
+  DB  102,69,15,56,43,246                 ; packusdw      %xmm14,%xmm14
+  DB  102,69,15,97,196                    ; punpcklwd     %xmm12,%xmm8
+  DB  102,69,15,97,238                    ; punpcklwd     %xmm14,%xmm13
   DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  102,69,15,98,203                    ; punpckldq     %xmm11,%xmm9
+  DB  102,69,15,98,205                    ; punpckldq     %xmm13,%xmm9
   DB  243,68,15,127,12,248                ; movdqu        %xmm9,(%rax,%rdi,8)
-  DB  102,69,15,106,195                   ; punpckhdq     %xmm11,%xmm8
+  DB  102,69,15,106,197                   ; punpckhdq     %xmm13,%xmm8
   DB  243,68,15,127,68,248,16             ; movdqu        %xmm8,0x10(%rax,%rdi,8)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -13428,7 +13730,7 @@
   DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
   DB  72,139,8                            ; mov           (%rax),%rcx
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,132,4,1,0,0                      ; je            3985 <_sk_linear_gradient_sse41+0x13e>
+  DB  15,132,4,1,0,0                      ; je            3b7a <_sk_linear_gradient_sse41+0x13e>
   DB  72,131,236,88                       ; sub           $0x58,%rsp
   DB  15,41,36,36                         ; movaps        %xmm4,(%rsp)
   DB  15,41,108,36,16                     ; movaps        %xmm5,0x10(%rsp)
@@ -13479,13 +13781,13 @@
   DB  15,40,196                           ; movaps        %xmm4,%xmm0
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  72,255,201                          ; dec           %rcx
-  DB  15,133,65,255,255,255               ; jne           38ad <_sk_linear_gradient_sse41+0x66>
+  DB  15,133,65,255,255,255               ; jne           3aa2 <_sk_linear_gradient_sse41+0x66>
   DB  15,40,124,36,48                     ; movaps        0x30(%rsp),%xmm7
   DB  15,40,116,36,32                     ; movaps        0x20(%rsp),%xmm6
   DB  15,40,108,36,16                     ; movaps        0x10(%rsp),%xmm5
   DB  15,40,36,36                         ; movaps        (%rsp),%xmm4
   DB  72,131,196,88                       ; add           $0x58,%rsp
-  DB  235,13                              ; jmp           3992 <_sk_linear_gradient_sse41+0x14b>
+  DB  235,13                              ; jmp           3b87 <_sk_linear_gradient_sse41+0x14b>
   DB  15,87,201                           ; xorps         %xmm1,%xmm1
   DB  15,87,210                           ; xorps         %xmm2,%xmm2
   DB  15,87,219                           ; xorps         %xmm3,%xmm3
@@ -17182,41 +17484,69 @@
   DB  102,68,15,111,192                   ; movdqa        %xmm0,%xmm8
   DB  102,68,15,97,193                    ; punpcklwd     %xmm1,%xmm8
   DB  102,15,105,193                      ; punpckhwd     %xmm1,%xmm0
-  DB  102,65,15,111,200                   ; movdqa        %xmm8,%xmm1
-  DB  102,15,97,200                       ; punpcklwd     %xmm0,%xmm1
+  DB  102,69,15,111,224                   ; movdqa        %xmm8,%xmm12
+  DB  102,68,15,97,224                    ; punpcklwd     %xmm0,%xmm12
   DB  102,68,15,105,192                   ; punpckhwd     %xmm0,%xmm8
-  DB  184,0,4,0,4                         ; mov           $0x4000400,%eax
+  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
+  DB  102,69,15,111,236                   ; movdqa        %xmm12,%xmm13
+  DB  102,69,15,97,233                    ; punpcklwd     %xmm9,%xmm13
+  DB  184,0,128,0,0                       ; mov           $0x8000,%eax
+  DB  102,15,110,192                      ; movd          %eax,%xmm0
+  DB  102,68,15,112,208,0                 ; pshufd        $0x0,%xmm0,%xmm10
+  DB  102,65,15,111,205                   ; movdqa        %xmm13,%xmm1
+  DB  102,65,15,219,202                   ; pand          %xmm10,%xmm1
+  DB  184,0,124,0,0                       ; mov           $0x7c00,%eax
   DB  102,15,110,192                      ; movd          %eax,%xmm0
   DB  102,15,112,216,0                    ; pshufd        $0x0,%xmm0,%xmm3
-  DB  102,15,111,195                      ; movdqa        %xmm3,%xmm0
-  DB  102,15,101,193                      ; pcmpgtw       %xmm1,%xmm0
-  DB  102,15,223,193                      ; pandn         %xmm1,%xmm0
-  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
-  DB  102,65,15,97,193                    ; punpcklwd     %xmm9,%xmm0
-  DB  102,15,114,240,13                   ; pslld         $0xd,%xmm0
-  DB  184,0,0,128,119                     ; mov           $0x77800000,%eax
+  DB  102,65,15,111,197                   ; movdqa        %xmm13,%xmm0
+  DB  102,15,219,195                      ; pand          %xmm3,%xmm0
+  DB  102,68,15,239,233                   ; pxor          %xmm1,%xmm13
+  DB  102,15,114,241,16                   ; pslld         $0x10,%xmm1
+  DB  102,65,15,114,245,13                ; pslld         $0xd,%xmm13
+  DB  184,0,0,0,56                        ; mov           $0x38000000,%eax
   DB  102,15,110,208                      ; movd          %eax,%xmm2
-  DB  102,68,15,112,210,0                 ; pshufd        $0x0,%xmm2,%xmm10
-  DB  65,15,89,194                        ; mulps         %xmm10,%xmm0
-  DB  102,15,112,209,78                   ; pshufd        $0x4e,%xmm1,%xmm2
-  DB  102,15,111,203                      ; movdqa        %xmm3,%xmm1
-  DB  102,15,101,202                      ; pcmpgtw       %xmm2,%xmm1
+  DB  102,68,15,112,218,0                 ; pshufd        $0x0,%xmm2,%xmm11
+  DB  102,65,15,254,203                   ; paddd         %xmm11,%xmm1
+  DB  102,65,15,254,205                   ; paddd         %xmm13,%xmm1
+  DB  102,65,15,118,193                   ; pcmpeqd       %xmm9,%xmm0
+  DB  102,15,223,193                      ; pandn         %xmm1,%xmm0
+  DB  102,65,15,115,220,8                 ; psrldq        $0x8,%xmm12
+  DB  102,69,15,97,225                    ; punpcklwd     %xmm9,%xmm12
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,65,15,219,210                   ; pand          %xmm10,%xmm2
+  DB  102,65,15,111,204                   ; movdqa        %xmm12,%xmm1
+  DB  102,15,219,203                      ; pand          %xmm3,%xmm1
+  DB  102,68,15,239,226                   ; pxor          %xmm2,%xmm12
+  DB  102,15,114,242,16                   ; pslld         $0x10,%xmm2
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,65,15,254,211                   ; paddd         %xmm11,%xmm2
+  DB  102,65,15,254,212                   ; paddd         %xmm12,%xmm2
+  DB  102,65,15,118,201                   ; pcmpeqd       %xmm9,%xmm1
   DB  102,15,223,202                      ; pandn         %xmm2,%xmm1
-  DB  102,65,15,97,201                    ; punpcklwd     %xmm9,%xmm1
-  DB  102,15,114,241,13                   ; pslld         $0xd,%xmm1
-  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,65,15,101,208                   ; pcmpgtw       %xmm8,%xmm2
-  DB  102,65,15,223,208                   ; pandn         %xmm8,%xmm2
-  DB  102,65,15,97,209                    ; punpcklwd     %xmm9,%xmm2
-  DB  102,15,114,242,13                   ; pslld         $0xd,%xmm2
-  DB  65,15,89,210                        ; mulps         %xmm10,%xmm2
-  DB  102,69,15,112,192,78                ; pshufd        $0x4e,%xmm8,%xmm8
-  DB  102,65,15,101,216                   ; pcmpgtw       %xmm8,%xmm3
-  DB  102,65,15,223,216                   ; pandn         %xmm8,%xmm3
-  DB  102,65,15,97,217                    ; punpcklwd     %xmm9,%xmm3
-  DB  102,15,114,243,13                   ; pslld         $0xd,%xmm3
-  DB  65,15,89,218                        ; mulps         %xmm10,%xmm3
+  DB  102,69,15,111,224                   ; movdqa        %xmm8,%xmm12
+  DB  102,69,15,97,225                    ; punpcklwd     %xmm9,%xmm12
+  DB  102,69,15,111,236                   ; movdqa        %xmm12,%xmm13
+  DB  102,69,15,219,234                   ; pand          %xmm10,%xmm13
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,15,219,211                      ; pand          %xmm3,%xmm2
+  DB  102,69,15,239,229                   ; pxor          %xmm13,%xmm12
+  DB  102,65,15,114,245,16                ; pslld         $0x10,%xmm13
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,69,15,254,235                   ; paddd         %xmm11,%xmm13
+  DB  102,69,15,254,236                   ; paddd         %xmm12,%xmm13
+  DB  102,65,15,118,209                   ; pcmpeqd       %xmm9,%xmm2
+  DB  102,65,15,223,213                   ; pandn         %xmm13,%xmm2
+  DB  102,65,15,115,216,8                 ; psrldq        $0x8,%xmm8
+  DB  102,69,15,97,193                    ; punpcklwd     %xmm9,%xmm8
+  DB  102,69,15,219,208                   ; pand          %xmm8,%xmm10
+  DB  102,65,15,219,216                   ; pand          %xmm8,%xmm3
+  DB  102,69,15,239,194                   ; pxor          %xmm10,%xmm8
+  DB  102,65,15,114,242,16                ; pslld         $0x10,%xmm10
+  DB  102,65,15,114,240,13                ; pslld         $0xd,%xmm8
+  DB  102,69,15,254,211                   ; paddd         %xmm11,%xmm10
+  DB  102,69,15,254,208                   ; paddd         %xmm8,%xmm10
+  DB  102,65,15,118,217                   ; pcmpeqd       %xmm9,%xmm3
+  DB  102,65,15,223,218                   ; pandn         %xmm10,%xmm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
@@ -17251,41 +17581,69 @@
   DB  102,68,15,111,193                   ; movdqa        %xmm1,%xmm8
   DB  102,68,15,97,194                    ; punpcklwd     %xmm2,%xmm8
   DB  102,15,105,202                      ; punpckhwd     %xmm2,%xmm1
-  DB  102,65,15,111,208                   ; movdqa        %xmm8,%xmm2
-  DB  102,15,97,209                       ; punpcklwd     %xmm1,%xmm2
+  DB  102,69,15,111,224                   ; movdqa        %xmm8,%xmm12
+  DB  102,68,15,97,225                    ; punpcklwd     %xmm1,%xmm12
   DB  102,68,15,105,193                   ; punpckhwd     %xmm1,%xmm8
-  DB  184,0,4,0,4                         ; mov           $0x4000400,%eax
+  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
+  DB  102,69,15,111,236                   ; movdqa        %xmm12,%xmm13
+  DB  102,69,15,97,233                    ; punpcklwd     %xmm9,%xmm13
+  DB  184,0,128,0,0                       ; mov           $0x8000,%eax
+  DB  102,15,110,192                      ; movd          %eax,%xmm0
+  DB  102,68,15,112,208,0                 ; pshufd        $0x0,%xmm0,%xmm10
+  DB  102,65,15,111,205                   ; movdqa        %xmm13,%xmm1
+  DB  102,65,15,219,202                   ; pand          %xmm10,%xmm1
+  DB  184,0,124,0,0                       ; mov           $0x7c00,%eax
   DB  102,15,110,192                      ; movd          %eax,%xmm0
   DB  102,15,112,216,0                    ; pshufd        $0x0,%xmm0,%xmm3
-  DB  102,15,111,195                      ; movdqa        %xmm3,%xmm0
-  DB  102,15,101,194                      ; pcmpgtw       %xmm2,%xmm0
-  DB  102,15,223,194                      ; pandn         %xmm2,%xmm0
-  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
-  DB  102,65,15,97,193                    ; punpcklwd     %xmm9,%xmm0
-  DB  102,15,114,240,13                   ; pslld         $0xd,%xmm0
-  DB  184,0,0,128,119                     ; mov           $0x77800000,%eax
-  DB  102,15,110,200                      ; movd          %eax,%xmm1
-  DB  102,68,15,112,209,0                 ; pshufd        $0x0,%xmm1,%xmm10
-  DB  65,15,89,194                        ; mulps         %xmm10,%xmm0
-  DB  102,15,112,210,78                   ; pshufd        $0x4e,%xmm2,%xmm2
-  DB  102,15,111,203                      ; movdqa        %xmm3,%xmm1
-  DB  102,15,101,202                      ; pcmpgtw       %xmm2,%xmm1
+  DB  102,65,15,111,197                   ; movdqa        %xmm13,%xmm0
+  DB  102,15,219,195                      ; pand          %xmm3,%xmm0
+  DB  102,68,15,239,233                   ; pxor          %xmm1,%xmm13
+  DB  102,15,114,241,16                   ; pslld         $0x10,%xmm1
+  DB  102,65,15,114,245,13                ; pslld         $0xd,%xmm13
+  DB  184,0,0,0,56                        ; mov           $0x38000000,%eax
+  DB  102,15,110,208                      ; movd          %eax,%xmm2
+  DB  102,68,15,112,218,0                 ; pshufd        $0x0,%xmm2,%xmm11
+  DB  102,65,15,254,203                   ; paddd         %xmm11,%xmm1
+  DB  102,65,15,254,205                   ; paddd         %xmm13,%xmm1
+  DB  102,65,15,118,193                   ; pcmpeqd       %xmm9,%xmm0
+  DB  102,15,223,193                      ; pandn         %xmm1,%xmm0
+  DB  102,65,15,115,220,8                 ; psrldq        $0x8,%xmm12
+  DB  102,69,15,97,225                    ; punpcklwd     %xmm9,%xmm12
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,65,15,219,210                   ; pand          %xmm10,%xmm2
+  DB  102,65,15,111,204                   ; movdqa        %xmm12,%xmm1
+  DB  102,15,219,203                      ; pand          %xmm3,%xmm1
+  DB  102,68,15,239,226                   ; pxor          %xmm2,%xmm12
+  DB  102,15,114,242,16                   ; pslld         $0x10,%xmm2
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,65,15,254,211                   ; paddd         %xmm11,%xmm2
+  DB  102,65,15,254,212                   ; paddd         %xmm12,%xmm2
+  DB  102,65,15,118,201                   ; pcmpeqd       %xmm9,%xmm1
   DB  102,15,223,202                      ; pandn         %xmm2,%xmm1
-  DB  102,65,15,97,201                    ; punpcklwd     %xmm9,%xmm1
-  DB  102,15,114,241,13                   ; pslld         $0xd,%xmm1
-  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,65,15,101,208                   ; pcmpgtw       %xmm8,%xmm2
-  DB  102,65,15,223,208                   ; pandn         %xmm8,%xmm2
-  DB  102,65,15,97,209                    ; punpcklwd     %xmm9,%xmm2
-  DB  102,15,114,242,13                   ; pslld         $0xd,%xmm2
-  DB  65,15,89,210                        ; mulps         %xmm10,%xmm2
-  DB  102,69,15,112,192,78                ; pshufd        $0x4e,%xmm8,%xmm8
-  DB  102,65,15,101,216                   ; pcmpgtw       %xmm8,%xmm3
-  DB  102,65,15,223,216                   ; pandn         %xmm8,%xmm3
-  DB  102,65,15,97,217                    ; punpcklwd     %xmm9,%xmm3
-  DB  102,15,114,243,13                   ; pslld         $0xd,%xmm3
-  DB  65,15,89,218                        ; mulps         %xmm10,%xmm3
+  DB  102,69,15,111,224                   ; movdqa        %xmm8,%xmm12
+  DB  102,69,15,97,225                    ; punpcklwd     %xmm9,%xmm12
+  DB  102,69,15,111,236                   ; movdqa        %xmm12,%xmm13
+  DB  102,69,15,219,234                   ; pand          %xmm10,%xmm13
+  DB  102,65,15,111,212                   ; movdqa        %xmm12,%xmm2
+  DB  102,15,219,211                      ; pand          %xmm3,%xmm2
+  DB  102,69,15,239,229                   ; pxor          %xmm13,%xmm12
+  DB  102,65,15,114,245,16                ; pslld         $0x10,%xmm13
+  DB  102,65,15,114,244,13                ; pslld         $0xd,%xmm12
+  DB  102,69,15,254,235                   ; paddd         %xmm11,%xmm13
+  DB  102,69,15,254,236                   ; paddd         %xmm12,%xmm13
+  DB  102,65,15,118,209                   ; pcmpeqd       %xmm9,%xmm2
+  DB  102,65,15,223,213                   ; pandn         %xmm13,%xmm2
+  DB  102,65,15,115,216,8                 ; psrldq        $0x8,%xmm8
+  DB  102,69,15,97,193                    ; punpcklwd     %xmm9,%xmm8
+  DB  102,69,15,219,208                   ; pand          %xmm8,%xmm10
+  DB  102,65,15,219,216                   ; pand          %xmm8,%xmm3
+  DB  102,69,15,239,194                   ; pxor          %xmm10,%xmm8
+  DB  102,65,15,114,242,16                ; pslld         $0x10,%xmm10
+  DB  102,65,15,114,240,13                ; pslld         $0xd,%xmm8
+  DB  102,69,15,254,211                   ; paddd         %xmm11,%xmm10
+  DB  102,69,15,254,208                   ; paddd         %xmm8,%xmm10
+  DB  102,65,15,118,217                   ; pcmpeqd       %xmm9,%xmm3
+  DB  102,65,15,223,218                   ; pandn         %xmm10,%xmm3
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
@@ -17293,38 +17651,76 @@
 _sk_store_f16_sse2 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,139,0                            ; mov           (%rax),%rax
-  DB  185,0,0,128,7                       ; mov           $0x7800000,%ecx
+  DB  185,0,0,0,128                       ; mov           $0x80000000,%ecx
   DB  102,68,15,110,193                   ; movd          %ecx,%xmm8
   DB  102,69,15,112,200,0                 ; pshufd        $0x0,%xmm8,%xmm9
-  DB  102,69,15,111,193                   ; movdqa        %xmm9,%xmm8
-  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
-  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
-  DB  102,65,15,114,240,16                ; pslld         $0x10,%xmm8
-  DB  102,65,15,114,224,16                ; psrad         $0x10,%xmm8
+  DB  102,69,15,111,225                   ; movdqa        %xmm9,%xmm12
+  DB  102,68,15,219,224                   ; pand          %xmm0,%xmm12
+  DB  102,68,15,111,192                   ; movdqa        %xmm0,%xmm8
+  DB  102,69,15,239,196                   ; pxor          %xmm12,%xmm8
+  DB  185,0,0,128,56                      ; mov           $0x38800000,%ecx
+  DB  102,68,15,110,209                   ; movd          %ecx,%xmm10
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
+  DB  102,65,15,114,212,16                ; psrld         $0x10,%xmm12
+  DB  102,69,15,111,232                   ; movdqa        %xmm8,%xmm13
+  DB  102,65,15,114,213,13                ; psrld         $0xd,%xmm13
+  DB  185,0,192,1,0                       ; mov           $0x1c000,%ecx
+  DB  102,68,15,110,217                   ; movd          %ecx,%xmm11
+  DB  102,69,15,112,219,0                 ; pshufd        $0x0,%xmm11,%xmm11
+  DB  102,69,15,250,227                   ; psubd         %xmm11,%xmm12
+  DB  102,69,15,254,229                   ; paddd         %xmm13,%xmm12
+  DB  102,65,15,114,244,16                ; pslld         $0x10,%xmm12
+  DB  102,65,15,114,228,16                ; psrad         $0x10,%xmm12
+  DB  69,15,194,194,5                     ; cmpnltps      %xmm10,%xmm8
+  DB  69,15,84,196                        ; andps         %xmm12,%xmm8
   DB  102,69,15,107,192                   ; packssdw      %xmm8,%xmm8
-  DB  102,69,15,111,209                   ; movdqa        %xmm9,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  102,65,15,114,210,13                ; psrld         $0xd,%xmm10
-  DB  102,65,15,114,242,16                ; pslld         $0x10,%xmm10
-  DB  102,65,15,114,226,16                ; psrad         $0x10,%xmm10
-  DB  102,69,15,107,210                   ; packssdw      %xmm10,%xmm10
-  DB  102,69,15,111,217                   ; movdqa        %xmm9,%xmm11
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  102,65,15,114,211,13                ; psrld         $0xd,%xmm11
-  DB  102,65,15,114,243,16                ; pslld         $0x10,%xmm11
-  DB  102,65,15,114,227,16                ; psrad         $0x10,%xmm11
-  DB  102,69,15,107,219                   ; packssdw      %xmm11,%xmm11
-  DB  68,15,89,203                        ; mulps         %xmm3,%xmm9
-  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
+  DB  102,69,15,111,233                   ; movdqa        %xmm9,%xmm13
+  DB  102,68,15,219,233                   ; pand          %xmm1,%xmm13
+  DB  102,68,15,111,225                   ; movdqa        %xmm1,%xmm12
+  DB  102,69,15,239,229                   ; pxor          %xmm13,%xmm12
+  DB  102,65,15,114,213,16                ; psrld         $0x10,%xmm13
+  DB  102,69,15,111,244                   ; movdqa        %xmm12,%xmm14
+  DB  102,65,15,114,214,13                ; psrld         $0xd,%xmm14
+  DB  102,69,15,250,235                   ; psubd         %xmm11,%xmm13
+  DB  102,69,15,254,238                   ; paddd         %xmm14,%xmm13
+  DB  102,65,15,114,245,16                ; pslld         $0x10,%xmm13
+  DB  102,65,15,114,229,16                ; psrad         $0x10,%xmm13
+  DB  69,15,194,226,5                     ; cmpnltps      %xmm10,%xmm12
+  DB  69,15,84,229                        ; andps         %xmm13,%xmm12
+  DB  102,69,15,107,228                   ; packssdw      %xmm12,%xmm12
+  DB  102,69,15,111,241                   ; movdqa        %xmm9,%xmm14
+  DB  102,68,15,219,242                   ; pand          %xmm2,%xmm14
+  DB  102,68,15,111,234                   ; movdqa        %xmm2,%xmm13
+  DB  102,69,15,239,238                   ; pxor          %xmm14,%xmm13
+  DB  102,65,15,114,214,16                ; psrld         $0x10,%xmm14
+  DB  102,69,15,111,253                   ; movdqa        %xmm13,%xmm15
+  DB  102,65,15,114,215,13                ; psrld         $0xd,%xmm15
+  DB  102,69,15,250,243                   ; psubd         %xmm11,%xmm14
+  DB  102,69,15,254,247                   ; paddd         %xmm15,%xmm14
+  DB  102,65,15,114,246,16                ; pslld         $0x10,%xmm14
+  DB  102,65,15,114,230,16                ; psrad         $0x10,%xmm14
+  DB  69,15,194,234,5                     ; cmpnltps      %xmm10,%xmm13
+  DB  69,15,84,238                        ; andps         %xmm14,%xmm13
+  DB  102,69,15,107,237                   ; packssdw      %xmm13,%xmm13
+  DB  102,68,15,219,203                   ; pand          %xmm3,%xmm9
+  DB  102,68,15,111,243                   ; movdqa        %xmm3,%xmm14
+  DB  102,69,15,239,241                   ; pxor          %xmm9,%xmm14
+  DB  102,65,15,114,209,16                ; psrld         $0x10,%xmm9
+  DB  102,69,15,111,254                   ; movdqa        %xmm14,%xmm15
+  DB  102,65,15,114,215,13                ; psrld         $0xd,%xmm15
+  DB  102,69,15,250,203                   ; psubd         %xmm11,%xmm9
+  DB  102,69,15,254,207                   ; paddd         %xmm15,%xmm9
   DB  102,65,15,114,241,16                ; pslld         $0x10,%xmm9
   DB  102,65,15,114,225,16                ; psrad         $0x10,%xmm9
-  DB  102,69,15,107,201                   ; packssdw      %xmm9,%xmm9
-  DB  102,69,15,97,194                    ; punpcklwd     %xmm10,%xmm8
-  DB  102,69,15,97,217                    ; punpcklwd     %xmm9,%xmm11
+  DB  69,15,194,242,5                     ; cmpnltps      %xmm10,%xmm14
+  DB  69,15,84,241                        ; andps         %xmm9,%xmm14
+  DB  102,69,15,107,246                   ; packssdw      %xmm14,%xmm14
+  DB  102,69,15,97,196                    ; punpcklwd     %xmm12,%xmm8
+  DB  102,69,15,97,238                    ; punpcklwd     %xmm14,%xmm13
   DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  102,69,15,98,203                    ; punpckldq     %xmm11,%xmm9
+  DB  102,69,15,98,205                    ; punpckldq     %xmm13,%xmm9
   DB  243,68,15,127,12,248                ; movdqu        %xmm9,(%rax,%rdi,8)
-  DB  102,69,15,106,195                   ; punpckhdq     %xmm11,%xmm8
+  DB  102,69,15,106,197                   ; punpckhdq     %xmm13,%xmm8
   DB  243,68,15,127,68,248,16             ; movdqu        %xmm8,0x10(%rax,%rdi,8)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -17907,7 +18303,7 @@
   DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
   DB  72,139,8                            ; mov           (%rax),%rcx
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,132,15,1,0,0                     ; je            3d47 <_sk_linear_gradient_sse2+0x149>
+  DB  15,132,15,1,0,0                     ; je            3f3e <_sk_linear_gradient_sse2+0x149>
   DB  72,139,64,8                         ; mov           0x8(%rax),%rax
   DB  72,131,192,32                       ; add           $0x20,%rax
   DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
@@ -17968,8 +18364,8 @@
   DB  69,15,86,231                        ; orps          %xmm15,%xmm12
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  72,255,201                          ; dec           %rcx
-  DB  15,133,8,255,255,255                ; jne           3c4d <_sk_linear_gradient_sse2+0x4f>
-  DB  235,13                              ; jmp           3d54 <_sk_linear_gradient_sse2+0x156>
+  DB  15,133,8,255,255,255                ; jne           3e44 <_sk_linear_gradient_sse2+0x4f>
+  DB  235,13                              ; jmp           3f4b <_sk_linear_gradient_sse2+0x156>
   DB  15,87,201                           ; xorps         %xmm1,%xmm1
   DB  15,87,210                           ; xorps         %xmm2,%xmm2
   DB  15,87,219                           ; xorps         %xmm3,%xmm3
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index bd8ad40..3cb1785 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -74,16 +74,6 @@
         ptr[3] = a;
     }
 
-    SI F from_half(U16 h) {
-        if ((int16_t)h < 0x0400) { h = 0; }   // Flush denorm and negative to zero.
-        return bit_cast<F>(h << 13)           // Line up the mantissa,
-             * bit_cast<F>(U32(0x77800000));  // then fix up the exponent.
-    }
-    SI U16 to_half(F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    }
-
 #elif defined(__aarch64__)
     #include <arm_neon.h>
 
@@ -143,9 +133,6 @@
         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
     }
 
-    SI F from_half(U16 h) { return vcvt_f32_f16(h); }
-    SI U16 to_half(F   f) { return vcvt_f16_f32(f); }
-
 #elif defined(__arm__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -222,15 +209,6 @@
         vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
     }
 
-    SI F from_half(U16 h) {
-        auto v = widen_cast<uint16x4_t>(h);
-        return vget_low_f32(vcvt_f32_f16(v));
-    }
-    SI U16 to_half(F f) {
-        auto v = widen_cast<float32x4_t>(f);
-        uint16x4_t h = vcvt_f16_f32(v);
-        return unaligned_load<U16>(&h);
-    }
 
 #elif defined(__AVX__)
     #include <immintrin.h>
@@ -445,29 +423,6 @@
         }
     }
 
-    SI F from_half(U16 h) {
-    #if defined(__AVX2__)
-        return _mm256_cvtph_ps(h);
-    #else
-        // This technique would slow down ~10x for denorm inputs, so we flush them to zero.
-        // With a signed comparison this conveniently also flushes negative half floats to zero.
-        h = _mm_andnot_si128(_mm_cmplt_epi16(h, _mm_set1_epi32(0x04000400_i)), h);
-
-        U32 w = _mm256_setr_m128i(_mm_unpacklo_epi16(h, _mm_setzero_si128()),
-                                  _mm_unpackhi_epi16(h, _mm_setzero_si128()));
-        return bit_cast<F>(w << 13)             // Line up the mantissa,
-             * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
-    #endif
-    }
-    SI U16 to_half(F f) {
-    #if defined(__AVX2__)
-        return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
-    #else
-        return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-                    >> 13);                                            // then line up the mantissa.
-    #endif
-    }
-
 #elif defined(__SSE2__)
     #include <immintrin.h>
 
@@ -582,21 +537,6 @@
         _mm_storeu_ps(ptr+ 8, b);
         _mm_storeu_ps(ptr+12, a);
     }
-
-    SI F from_half(U16 h) {
-        auto v = widen_cast<__m128i>(h);
-
-        // Same deal as AVX: flush denorms and negatives to zero.
-        v = _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
-
-        U32 w = _mm_unpacklo_epi16(v, _mm_setzero_si128());
-        return bit_cast<F>(w << 13)             // Line up the mantissa,
-             * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
-    }
-    SI U16 to_half(F f) {
-        return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-                    >> 13);                                            // then line up the mantissa.
-    }
 #endif
 
 // We need to be a careful with casts.
@@ -614,6 +554,11 @@
     SI U32 expand(U8  v) { return (U32)v; }
 #endif
 
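+// Lanewise select for non-float vector types, reusing the existing float
+// if_then_else by bitcasting the payload through F.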
+template <typename V>
+SI V if_then_else(I32 c, V t, V e) {
+    return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
+}
+
 SI U16 bswap(U16 x) {
 #if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
     // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
@@ -652,4 +597,55 @@
     return approx_pow2(approx_log2(x) * y);
 }
 
+SI F from_half(U16 h) {
+#if defined(JUMPER) && defined(__aarch64__)
+    return vcvt_f32_f16(h);
+
+#elif defined(JUMPER) && defined(__arm__)
+    auto v = widen_cast<uint16x4_t>(h);
+    return vget_low_f32(vcvt_f32_f16(v));
+
+#elif defined(JUMPER) && defined(__AVX2__)
+    return _mm256_cvtph_ps(h);
+
+#else
+    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
+    U32 sem = expand(h),
+        s   = sem & 0x8000_i,
+         e  = sem & 0x7c00_i,
+         em = sem ^ s;
+
+    // Convert to 1-8-23 float with 127 bias, flushing denorm halves (including zero) to zero.
+    return if_then_else(e == 0, 0
+                              , bit_cast<F>( (s<<16) + (em<<13) + C((127-15)<<23) ));
+#endif
+}
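+
+// A worked check of the portable branch above: 0.5f as a half is 0x3800,
+// so s = 0 and em = 0x3800.  Then
+//     (em<<13) + ((127-15)<<23) = 0x07000000 + 0x38000000 = 0x3f000000,
+// which is exactly the bit pattern of 0.5f.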
+
+SI U16 to_half(F f) {
+#if defined(JUMPER) && defined(__aarch64__)
+    return vcvt_f16_f32(f);
+
+#elif defined(JUMPER) && defined(__arm__)
+    auto v = widen_cast<float32x4_t>(f);
+    uint16x4_t h = vcvt_f16_f32(v);
+    return unaligned_load<U16>(&h);
+
+#elif defined(JUMPER) && defined(__AVX2__)
+    return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+
+#else
+    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
+    U32 sem = bit_cast<U32>(f),
+        s   = sem & 0x80000000_i,
+         em = sem ^ s;
+
+    // Convert to 1-5-10 half with 15 bias, flushing denorm halves (including zero) to zero.
+    auto denorm = bit_cast<F>(em) < C(1.0f / (1<<14));
+    return pack(if_then_else(denorm, U32(0)
+                                   , (s>>16) + (em>>13) - C((127-15)<<10)));
+#endif
+}
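+
+// A worked check of the portable branch above: 0.5f has bits 0x3f000000,
+// so s = 0 and em = 0x3f000000.  Then
+//     (em>>13) - ((127-15)<<10) = 0x1f800 - 0x1c000 = 0x3800,
+// the half encoding of 0.5.  The cutoff 1.0f/(1<<14) is 2^-14, the
+// smallest positive normal half, so anything smaller flushes to zero.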
+
 #endif//SkJumper_vectors_DEFINED
diff --git a/tests/F16StagesTest.cpp b/tests/F16StagesTest.cpp
new file mode 100644
index 0000000..73072e3
--- /dev/null
+++ b/tests/F16StagesTest.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkRasterPipeline.h"
+#include "Test.h"
+
+DEF_TEST(F16Stages, r) {
+    // Make sure SkRasterPipeline::load_f16 and store_f16 can handle a range of
+    // ordinary (0<=x<=1) and interesting (x<0, x>1) values.
+    float floats[16] = {
+        0.0f, 0.25f, 0.5f, 1.0f,
+        -1.25f, -0.5f, 1.25f, 2.0f,
+        0,0,0,0, 0,0,0,0,  // pad a bit to make sure we qualify for platform-specific code
+    };
+    uint16_t halfs[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+
+    float*    f32 = floats;
+    uint16_t* f16 = halfs;
+
+    {
+        SkRasterPipeline p;
+        p.append(SkRasterPipeline:: load_f32, &f32);
+        p.append(SkRasterPipeline::store_f16, &f16);
+        p.run(0,16/4);
+    }
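+    // Expected bits follow from the 1-5-10 half layout, e.g.
+    // 0.25f -> 0x3400: sign 0, exponent field 13 (13-15 = -2), mantissa 0.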
+    REPORTER_ASSERT(r, f16[0] == 0x0000);
+    REPORTER_ASSERT(r, f16[1] == 0x3400);
+    REPORTER_ASSERT(r, f16[2] == 0x3800);
+    REPORTER_ASSERT(r, f16[3] == 0x3c00);
+    REPORTER_ASSERT(r, f16[4] == 0xbd00);
+    REPORTER_ASSERT(r, f16[5] == 0xb800);
+    REPORTER_ASSERT(r, f16[6] == 0x3d00);
+    REPORTER_ASSERT(r, f16[7] == 0x4000);
+
+    {
+        SkRasterPipeline p;
+        p.append(SkRasterPipeline:: load_f16, &f16);
+        p.append(SkRasterPipeline::store_f32, &f32);
+        p.run(0,16/4);
+    }
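+    // Each of these values is exactly representable as a half,
+    // so the round trip back to f32 should be bit-exact.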
+    REPORTER_ASSERT(r, f32[0] ==  0.00f);
+    REPORTER_ASSERT(r, f32[1] ==  0.25f);
+    REPORTER_ASSERT(r, f32[2] ==  0.50f);
+    REPORTER_ASSERT(r, f32[3] ==  1.00f);
+    REPORTER_ASSERT(r, f32[4] == -1.25f);
+    REPORTER_ASSERT(r, f32[5] == -0.50f);
+    REPORTER_ASSERT(r, f32[6] ==  1.25f);
+    REPORTER_ASSERT(r, f32[7] ==  2.00f);
+}