SkJumper: perspective matrix

Change-Id: I2c63e0996e4689950f8f3b82da0fb07941c26044
Reviewed-on: https://skia-review.googlesource.com/8952
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 0821666..488caf6 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -46,47 +46,48 @@
     31.0f, 63.0f,
 };
 
-#define STAGES(M)     \
-    M(seed_shader)    \
-    M(constant_color) \
-    M(clear)          \
-    M(plus_)          \
-    M(srcover)        \
-    M(dstover)        \
-    M(clamp_0)        \
-    M(clamp_1)        \
-    M(clamp_a)        \
-    M(set_rgb)        \
-    M(swap_rb)        \
-    M(swap)           \
-    M(move_src_dst)   \
-    M(move_dst_src)   \
-    M(premul)         \
-    M(unpremul)       \
-    M(from_srgb)      \
-    M(to_srgb)        \
-    M(scale_1_float)  \
-    M(scale_u8)       \
-    M(lerp_1_float)   \
-    M(lerp_u8)        \
-    M(lerp_565)       \
-    M(load_tables)    \
-    M(load_a8)        \
-    M(store_a8)       \
-    M(load_565)       \
-    M(store_565)      \
-    M(load_8888)      \
-    M(store_8888)     \
-    M(load_f16)       \
-    M(store_f16)      \
-    M(matrix_2x3)     \
-    M(matrix_3x4)     \
-    M(clamp_x)        \
-    M(clamp_y)        \
-    M(repeat_x)       \
-    M(repeat_y)       \
-    M(mirror_x)       \
-    M(mirror_y)       \
+#define STAGES(M)         \
+    M(seed_shader)        \
+    M(constant_color)     \
+    M(clear)              \
+    M(plus_)              \
+    M(srcover)            \
+    M(dstover)            \
+    M(clamp_0)            \
+    M(clamp_1)            \
+    M(clamp_a)            \
+    M(set_rgb)            \
+    M(swap_rb)            \
+    M(swap)               \
+    M(move_src_dst)       \
+    M(move_dst_src)       \
+    M(premul)             \
+    M(unpremul)           \
+    M(from_srgb)          \
+    M(to_srgb)            \
+    M(scale_1_float)      \
+    M(scale_u8)           \
+    M(lerp_1_float)       \
+    M(lerp_u8)            \
+    M(lerp_565)           \
+    M(load_tables)        \
+    M(load_a8)            \
+    M(store_a8)           \
+    M(load_565)           \
+    M(store_565)          \
+    M(load_8888)          \
+    M(store_8888)         \
+    M(load_f16)           \
+    M(store_f16)          \
+    M(matrix_2x3)         \
+    M(matrix_3x4)         \
+    M(matrix_perspective) /* ctx: 9 floats, row-major (unlike the other matrix_ stages) */ \
+    M(clamp_x)            \
+    M(clamp_y)            \
+    M(repeat_x)           \
+    M(repeat_y)           \
+    M(mirror_x)           \
+    M(mirror_y)           \
     M(linear_gradient_2stops)
 
 // We can't express the real types of most stage functions portably, so we use a stand-in.
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 071aeea..7da1489 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -832,6 +832,33 @@
   .long  0x4eb21e42                          // mov           v2.16b, v18.16b
   .long  0xd61f0060                          // br            x3
 
+.globl _sk_matrix_perspective_aarch64
+_sk_matrix_perspective_aarch64:              // v0/v1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at ctx
+  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16 | x8 = m (ctx), x3 = next stage, x1 += 16
+  .long  0xaa0803e9                          // mov           x9, x8 | x9 = &m[0]
+  .long  0x9100510a                          // add           x10, x8, #0x14 | x10 = &m[5]
+  .long  0x4ddfc930                          // ld1r          {v16.4s}, [x9], #4 | v16 = splat(m[0]); x9 = &m[1]
+  .long  0x4d40c951                          // ld1r          {v17.4s}, [x10] | v17 = splat(m[5])
+  .long  0x9100810a                          // add           x10, x8, #0x20 | x10 = &m[8]
+  .long  0x4d40c952                          // ld1r          {v18.4s}, [x10] | v18 = splat(m[8])
+  .long  0x2d41d113                          // ldp           s19, s20, [x8,#12] | s19 = m[3], s20 = m[4]
+  .long  0x2d435915                          // ldp           s21, s22, [x8,#24] | s21 = m[6], s22 = m[7]
+  .long  0x91002108                          // add           x8, x8, #0x8 | x8 = &m[2]
+  .long  0x4f941031                          // fmla          v17.4s, v1.4s, v20.s[0] | v17 = m[5] + g*m[4]
+  .long  0x4d40c914                          // ld1r          {v20.4s}, [x8] | v20 = splat(m[2])
+  .long  0x4f961032                          // fmla          v18.4s, v1.4s, v22.s[0] | v18 = m[8] + g*m[7]
+  .long  0xbd400136                          // ldr           s22, [x9] | s22 = m[1]
+  .long  0x4f951012                          // fmla          v18.4s, v0.4s, v21.s[0] | v18 += r*m[6]  -> Z
+  .long  0x4f931011                          // fmla          v17.4s, v0.4s, v19.s[0] | v17 += r*m[3]  -> G
+  .long  0x4f961034                          // fmla          v20.4s, v1.4s, v22.s[0] | v20 = m[2] + g*m[1]
+  .long  0x4ea1da41                          // frecpe        v1.4s, v18.4s | v1 = reciprocal estimate of Z (g is dead from here)
+  .long  0x4e21fe52                          // frecps        v18.4s, v18.4s, v1.4s | v18 = 2 - Z*estimate (refinement factor)
+  .long  0x6e32dc32                          // fmul          v18.4s, v1.4s, v18.4s | v18 = refined rcp(Z)
+  .long  0x4e20ce14                          // fmla          v20.4s, v16.4s, v0.4s | v20 += m[0]*r  -> R
+  .long  0x6e32de21                          // fmul          v1.4s, v17.4s, v18.4s | g = G * rcp(Z)
+  .long  0x6e32de80                          // fmul          v0.4s, v20.4s, v18.4s | r = R * rcp(Z)
+  .long  0xd61f0060                          // br            x3 | tail-jump to the next stage
+
 .globl _sk_linear_gradient_2stops_aarch64
 _sk_linear_gradient_2stops_aarch64:
   .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
@@ -1791,6 +1818,43 @@
   .long  0xe8bd4800                          // pop           {fp, lr}
   .long  0xe12fff1c                          // bx            ip
 
+.globl _sk_matrix_perspective_vfp4
+_sk_matrix_perspective_vfp4:                 // d0/d1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at ctx
+  .long  0xe92d4800                          // push          {fp, lr} | lr is used as a scratch pointer below
+  .long  0xe591e000                          // ldr           lr, [r1] | lr = m (ctx)
+  .long  0xe591c004                          // ldr           ip, [r1, #4] | ip = next stage
+  .long  0xe2811008                          // add           r1, r1, #8 | advance past the two words just read
+  .long  0xe28e301c                          // add           r3, lr, #28 | r3 = &m[7]
+  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32] | d16 = splat(m[7])
+  .long  0xe28e3020                          // add           r3, lr, #32 | r3 = &m[8]
+  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32] | d17 = splat(m[8])
+  .long  0xe28e3018                          // add           r3, lr, #24 | r3 = &m[6]
+  .long  0xf2411c30                          // vfma.f32      d17, d1, d16 | d17 = m[8] + g*m[7]
+  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32] | d16 = splat(m[6])
+  .long  0xe28e3010                          // add           r3, lr, #16 | r3 = &m[4]
+  .long  0xf2401c30                          // vfma.f32      d17, d0, d16 | d17 += r*m[6]  -> Z
+  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32] | d16 = splat(m[4])
+  .long  0xe28e3004                          // add           r3, lr, #4 | r3 = &m[1]
+  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32] | d18 = splat(m[1])
+  .long  0xe28e3008                          // add           r3, lr, #8 | r3 = &m[2]
+  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32] | d20 = splat(m[2])
+  .long  0xe28e3014                          // add           r3, lr, #20 | r3 = &m[5]
+  .long  0xf2414c32                          // vfma.f32      d20, d1, d18 | d20 = m[2] + g*m[1]
+  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32] | d18 = splat(m[5])
+  .long  0xe28e300c                          // add           r3, lr, #12 | r3 = &m[3]
+  .long  0xf3fb3521                          // vrecpe.f32    d19, d17 | d19 = reciprocal estimate of Z
+  .long  0xf2412c30                          // vfma.f32      d18, d1, d16 | d18 = m[5] + g*m[4]
+  .long  0xf4e35c9f                          // vld1.32       {d21[]}, [r3 :32] | d21 = splat(m[3])
+  .long  0xf2410fb3                          // vrecps.f32    d16, d17, d19 | d16 = 2 - Z*estimate (refinement factor)
+  .long  0xf4ee1c9f                          // vld1.32       {d17[]}, [lr :32] | d17 = splat(m[0]) (Z no longer needed in d17)
+  .long  0xf2404c31                          // vfma.f32      d20, d0, d17 | d20 += r*m[0]  -> R
+  .long  0xf2402c35                          // vfma.f32      d18, d0, d21 | d18 += r*m[3]  -> G
+  .long  0xf3430db0                          // vmul.f32      d16, d19, d16 | d16 = refined rcp(Z)
+  .long  0xf3040db0                          // vmul.f32      d0, d20, d16 | r = R * rcp(Z)
+  .long  0xf3021db0                          // vmul.f32      d1, d18, d16 | g = G * rcp(Z)
+  .long  0xe8bd4800                          // pop           {fp, lr} | restore the registers pushed on entry
+  .long  0xe12fff1c                          // bx            ip | tail-jump to the next stage
+
 .globl _sk_linear_gradient_2stops_vfp4
 _sk_linear_gradient_2stops_vfp4:
   .long  0xe5913000                          // ldr           r3, [r1]
@@ -2551,6 +2615,30 @@
   .byte  197,124,41,210                      // vmovaps       %ymm10,%ymm2
   .byte  255,224                             // jmpq          *%rax
 
+.globl _sk_matrix_perspective_hsw
+_sk_matrix_perspective_hsw:                  // ymm0/ymm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8 | ymm8 = m[0]
+  .byte  196,98,125,24,72,4                  // vbroadcastss  0x4(%rax),%ymm9 | ymm9 = m[1]
+  .byte  196,98,125,24,80,8                  // vbroadcastss  0x8(%rax),%ymm10 | ymm10 = m[2]
+  .byte  196,66,117,184,209                  // vfmadd231ps   %ymm9,%ymm1,%ymm10 | ymm10 = m[2] + g*m[1]
+  .byte  196,66,125,184,208                  // vfmadd231ps   %ymm8,%ymm0,%ymm10 | ymm10 += r*m[0]  -> R
+  .byte  196,98,125,24,64,12                 // vbroadcastss  0xc(%rax),%ymm8 | ymm8 = m[3]
+  .byte  196,98,125,24,72,16                 // vbroadcastss  0x10(%rax),%ymm9 | ymm9 = m[4]
+  .byte  196,98,125,24,88,20                 // vbroadcastss  0x14(%rax),%ymm11 | ymm11 = m[5]
+  .byte  196,66,117,184,217                  // vfmadd231ps   %ymm9,%ymm1,%ymm11 | ymm11 = m[5] + g*m[4]
+  .byte  196,66,125,184,216                  // vfmadd231ps   %ymm8,%ymm0,%ymm11 | ymm11 += r*m[3]  -> G
+  .byte  196,98,125,24,64,24                 // vbroadcastss  0x18(%rax),%ymm8 | ymm8 = m[6]
+  .byte  196,98,125,24,72,28                 // vbroadcastss  0x1c(%rax),%ymm9 | ymm9 = m[7]
+  .byte  196,98,125,24,96,32                 // vbroadcastss  0x20(%rax),%ymm12 | ymm12 = m[8]
+  .byte  196,66,117,184,225                  // vfmadd231ps   %ymm9,%ymm1,%ymm12 | ymm12 = m[8] + g*m[7]
+  .byte  196,66,125,184,224                  // vfmadd231ps   %ymm8,%ymm0,%ymm12 | ymm12 += r*m[6]  -> Z
+  .byte  196,193,124,83,204                  // vrcpps        %ymm12,%ymm1 | ymm1 = rcp(Z), approximate reciprocal
+  .byte  197,172,89,193                      // vmulps        %ymm1,%ymm10,%ymm0 | r = R * rcp(Z)
+  .byte  197,164,89,201                      // vmulps        %ymm1,%ymm11,%ymm1 | g = G * rcp(Z)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = next stage
+  .byte  255,224                             // jmpq          *%rax | tail-jump to the next stage
+
 .globl _sk_linear_gradient_2stops_hsw
 _sk_linear_gradient_2stops_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
@@ -3551,6 +3639,36 @@
   .byte  197,124,41,201                      // vmovaps       %ymm9,%ymm1
   .byte  255,224                             // jmpq          *%rax
 
+.globl _sk_matrix_perspective_avx
+_sk_matrix_perspective_avx:                  // ymm0/ymm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8 | ymm8 = m[0]
+  .byte  196,98,125,24,72,4                  // vbroadcastss  0x4(%rax),%ymm9 | ymm9 = m[1]
+  .byte  196,98,125,24,80,8                  // vbroadcastss  0x8(%rax),%ymm10 | ymm10 = m[2]
+  .byte  197,52,89,201                       // vmulps        %ymm1,%ymm9,%ymm9 | ymm9 = g*m[1]
+  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9 | ymm9 += m[2]
+  .byte  197,60,89,192                       // vmulps        %ymm0,%ymm8,%ymm8 | ymm8 = r*m[0]
+  .byte  196,65,60,88,193                    // vaddps        %ymm9,%ymm8,%ymm8 | ymm8 = R
+  .byte  196,98,125,24,72,12                 // vbroadcastss  0xc(%rax),%ymm9 | ymm9 = m[3]
+  .byte  196,98,125,24,80,16                 // vbroadcastss  0x10(%rax),%ymm10 | ymm10 = m[4]
+  .byte  196,98,125,24,88,20                 // vbroadcastss  0x14(%rax),%ymm11 | ymm11 = m[5]
+  .byte  197,44,89,209                       // vmulps        %ymm1,%ymm10,%ymm10 | ymm10 = g*m[4]
+  .byte  196,65,44,88,211                    // vaddps        %ymm11,%ymm10,%ymm10 | ymm10 += m[5]
+  .byte  197,52,89,200                       // vmulps        %ymm0,%ymm9,%ymm9 | ymm9 = r*m[3]
+  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9 | ymm9 = G
+  .byte  196,98,125,24,80,24                 // vbroadcastss  0x18(%rax),%ymm10 | ymm10 = m[6]
+  .byte  196,98,125,24,88,28                 // vbroadcastss  0x1c(%rax),%ymm11 | ymm11 = m[7]
+  .byte  196,98,125,24,96,32                 // vbroadcastss  0x20(%rax),%ymm12 | ymm12 = m[8]
+  .byte  197,164,89,201                      // vmulps        %ymm1,%ymm11,%ymm1 | ymm1 = g*m[7] (g is dead from here)
+  .byte  196,193,116,88,204                  // vaddps        %ymm12,%ymm1,%ymm1 | ymm1 += m[8]
+  .byte  197,172,89,192                      // vmulps        %ymm0,%ymm10,%ymm0 | ymm0 = r*m[6] (r is dead from here)
+  .byte  197,252,88,193                      // vaddps        %ymm1,%ymm0,%ymm0 | ymm0 = Z
+  .byte  197,252,83,200                      // vrcpps        %ymm0,%ymm1 | ymm1 = rcp(Z), approximate reciprocal
+  .byte  197,188,89,193                      // vmulps        %ymm1,%ymm8,%ymm0 | r = R * rcp(Z)
+  .byte  197,180,89,201                      // vmulps        %ymm1,%ymm9,%ymm1 | g = G * rcp(Z)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = next stage
+  .byte  255,224                             // jmpq          *%rax | tail-jump to the next stage
+
 .globl _sk_linear_gradient_2stops_avx
 _sk_linear_gradient_2stops_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
@@ -4525,6 +4643,47 @@
   .byte  65,15,40,210                        // movaps        %xmm10,%xmm2
   .byte  255,224                             // jmpq          *%rax
 
+.globl _sk_matrix_perspective_sse41
+_sk_matrix_perspective_sse41:                // xmm0/xmm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8 | xmm8 = r (xmm0 is reused as an accumulator)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  .byte  243,15,16,0                         // movss         (%rax),%xmm0 | xmm0[0] = m[0]
+  .byte  243,68,15,16,72,4                   // movss         0x4(%rax),%xmm9 | xmm9[0] = m[1]
+  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0 | xmm0 = splat(m[0])
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[1])
+  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10 | xmm10[0] = m[2]
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[2])
+  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9 | xmm9 = g*m[1]
+  .byte  69,15,88,202                        // addps         %xmm10,%xmm9 | xmm9 += m[2]
+  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0 | xmm0 = r*m[0]
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0 | xmm0 = R
+  .byte  243,68,15,16,72,12                  // movss         0xc(%rax),%xmm9 | xmm9[0] = m[3]
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[3])
+  .byte  243,68,15,16,80,16                  // movss         0x10(%rax),%xmm10 | xmm10[0] = m[4]
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[4])
+  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11 | xmm11[0] = m[5]
+  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[5])
+  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10 | xmm10 = g*m[4]
+  .byte  69,15,88,211                        // addps         %xmm11,%xmm10 | xmm10 += m[5]
+  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9 | xmm9 = r*m[3]
+  .byte  69,15,88,202                        // addps         %xmm10,%xmm9 | xmm9 = G
+  .byte  243,68,15,16,80,24                  // movss         0x18(%rax),%xmm10 | xmm10[0] = m[6]
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[6])
+  .byte  243,68,15,16,88,28                  // movss         0x1c(%rax),%xmm11 | xmm11[0] = m[7]
+  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[7])
+  .byte  243,68,15,16,96,32                  // movss         0x20(%rax),%xmm12 | xmm12[0] = m[8]
+  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12 | xmm12 = splat(m[8])
+  .byte  68,15,89,217                        // mulps         %xmm1,%xmm11 | xmm11 = g*m[7]
+  .byte  69,15,88,220                        // addps         %xmm12,%xmm11 | xmm11 += m[8]
+  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10 | xmm10 = r*m[6]
+  .byte  69,15,88,211                        // addps         %xmm11,%xmm10 | xmm10 = Z
+  .byte  65,15,83,202                        // rcpps         %xmm10,%xmm1 | xmm1 = rcp(Z), approximate reciprocal
+  .byte  15,89,193                           // mulps         %xmm1,%xmm0 | r = R * rcp(Z)
+  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9 | xmm9 = G * rcp(Z)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = next stage
+  .byte  65,15,40,201                        // movaps        %xmm9,%xmm1 | g = G * rcp(Z)
+  .byte  255,224                             // jmpq          *%rax | tail-jump to the next stage
+
 .globl _sk_linear_gradient_2stops_sse41
 _sk_linear_gradient_2stops_sse41:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
@@ -5556,6 +5715,47 @@
   .byte  65,15,40,210                        // movaps        %xmm10,%xmm2
   .byte  255,224                             // jmpq          *%rax
 
+.globl _sk_matrix_perspective_sse2
+_sk_matrix_perspective_sse2:                 // xmm0/xmm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8 | xmm8 = r (xmm0 is reused as an accumulator)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  .byte  243,15,16,0                         // movss         (%rax),%xmm0 | xmm0[0] = m[0]
+  .byte  243,68,15,16,72,4                   // movss         0x4(%rax),%xmm9 | xmm9[0] = m[1]
+  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0 | xmm0 = splat(m[0])
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[1])
+  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10 | xmm10[0] = m[2]
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[2])
+  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9 | xmm9 = g*m[1]
+  .byte  69,15,88,202                        // addps         %xmm10,%xmm9 | xmm9 += m[2]
+  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0 | xmm0 = r*m[0]
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0 | xmm0 = R
+  .byte  243,68,15,16,72,12                  // movss         0xc(%rax),%xmm9 | xmm9[0] = m[3]
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[3])
+  .byte  243,68,15,16,80,16                  // movss         0x10(%rax),%xmm10 | xmm10[0] = m[4]
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[4])
+  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11 | xmm11[0] = m[5]
+  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[5])
+  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10 | xmm10 = g*m[4]
+  .byte  69,15,88,211                        // addps         %xmm11,%xmm10 | xmm10 += m[5]
+  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9 | xmm9 = r*m[3]
+  .byte  69,15,88,202                        // addps         %xmm10,%xmm9 | xmm9 = G
+  .byte  243,68,15,16,80,24                  // movss         0x18(%rax),%xmm10 | xmm10[0] = m[6]
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[6])
+  .byte  243,68,15,16,88,28                  // movss         0x1c(%rax),%xmm11 | xmm11[0] = m[7]
+  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[7])
+  .byte  243,68,15,16,96,32                  // movss         0x20(%rax),%xmm12 | xmm12[0] = m[8]
+  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12 | xmm12 = splat(m[8])
+  .byte  68,15,89,217                        // mulps         %xmm1,%xmm11 | xmm11 = g*m[7]
+  .byte  69,15,88,220                        // addps         %xmm12,%xmm11 | xmm11 += m[8]
+  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10 | xmm10 = r*m[6]
+  .byte  69,15,88,211                        // addps         %xmm11,%xmm10 | xmm10 = Z
+  .byte  65,15,83,202                        // rcpps         %xmm10,%xmm1 | xmm1 = rcp(Z), approximate reciprocal
+  .byte  15,89,193                           // mulps         %xmm1,%xmm0 | r = R * rcp(Z)
+  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9 | xmm9 = G * rcp(Z)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax | rax = next stage
+  .byte  65,15,40,201                        // movaps        %xmm9,%xmm1 | g = G * rcp(Z)
+  .byte  255,224                             // jmpq          *%rax | tail-jump to the next stage
+
 .globl _sk_linear_gradient_2stops_sse2
 _sk_linear_gradient_2stops_sse2:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 7c38fc0..eb7359d 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -772,6 +772,30 @@
   DB  197,124,41,210                      ; vmovaps       %ymm10,%ymm2
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_matrix_perspective_hsw
+_sk_matrix_perspective_hsw LABEL PROC     ; ymm0/ymm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8 | ymm8 = m[0]
+  DB  196,98,125,24,72,4                  ; vbroadcastss  0x4(%rax),%ymm9 | ymm9 = m[1]
+  DB  196,98,125,24,80,8                  ; vbroadcastss  0x8(%rax),%ymm10 | ymm10 = m[2]
+  DB  196,66,117,184,209                  ; vfmadd231ps   %ymm9,%ymm1,%ymm10 | ymm10 = m[2] + g*m[1]
+  DB  196,66,125,184,208                  ; vfmadd231ps   %ymm8,%ymm0,%ymm10 | ymm10 += r*m[0]  -> R
+  DB  196,98,125,24,64,12                 ; vbroadcastss  0xc(%rax),%ymm8 | ymm8 = m[3]
+  DB  196,98,125,24,72,16                 ; vbroadcastss  0x10(%rax),%ymm9 | ymm9 = m[4]
+  DB  196,98,125,24,88,20                 ; vbroadcastss  0x14(%rax),%ymm11 | ymm11 = m[5]
+  DB  196,66,117,184,217                  ; vfmadd231ps   %ymm9,%ymm1,%ymm11 | ymm11 = m[5] + g*m[4]
+  DB  196,66,125,184,216                  ; vfmadd231ps   %ymm8,%ymm0,%ymm11 | ymm11 += r*m[3]  -> G
+  DB  196,98,125,24,64,24                 ; vbroadcastss  0x18(%rax),%ymm8 | ymm8 = m[6]
+  DB  196,98,125,24,72,28                 ; vbroadcastss  0x1c(%rax),%ymm9 | ymm9 = m[7]
+  DB  196,98,125,24,96,32                 ; vbroadcastss  0x20(%rax),%ymm12 | ymm12 = m[8]
+  DB  196,66,117,184,225                  ; vfmadd231ps   %ymm9,%ymm1,%ymm12 | ymm12 = m[8] + g*m[7]
+  DB  196,66,125,184,224                  ; vfmadd231ps   %ymm8,%ymm0,%ymm12 | ymm12 += r*m[6]  -> Z
+  DB  196,193,124,83,204                  ; vrcpps        %ymm12,%ymm1 | ymm1 = rcp(Z), approximate reciprocal
+  DB  197,172,89,193                      ; vmulps        %ymm1,%ymm10,%ymm0 | r = R * rcp(Z)
+  DB  197,164,89,201                      ; vmulps        %ymm1,%ymm11,%ymm1 | g = G * rcp(Z)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = next stage
+  DB  255,224                             ; jmpq          *%rax | tail-jump to the next stage
+
 PUBLIC _sk_linear_gradient_2stops_hsw
 _sk_linear_gradient_2stops_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -1799,6 +1823,36 @@
   DB  197,124,41,201                      ; vmovaps       %ymm9,%ymm1
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_matrix_perspective_avx
+_sk_matrix_perspective_avx LABEL PROC     ; ymm0/ymm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8 | ymm8 = m[0]
+  DB  196,98,125,24,72,4                  ; vbroadcastss  0x4(%rax),%ymm9 | ymm9 = m[1]
+  DB  196,98,125,24,80,8                  ; vbroadcastss  0x8(%rax),%ymm10 | ymm10 = m[2]
+  DB  197,52,89,201                       ; vmulps        %ymm1,%ymm9,%ymm9 | ymm9 = g*m[1]
+  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9 | ymm9 += m[2]
+  DB  197,60,89,192                       ; vmulps        %ymm0,%ymm8,%ymm8 | ymm8 = r*m[0]
+  DB  196,65,60,88,193                    ; vaddps        %ymm9,%ymm8,%ymm8 | ymm8 = R
+  DB  196,98,125,24,72,12                 ; vbroadcastss  0xc(%rax),%ymm9 | ymm9 = m[3]
+  DB  196,98,125,24,80,16                 ; vbroadcastss  0x10(%rax),%ymm10 | ymm10 = m[4]
+  DB  196,98,125,24,88,20                 ; vbroadcastss  0x14(%rax),%ymm11 | ymm11 = m[5]
+  DB  197,44,89,209                       ; vmulps        %ymm1,%ymm10,%ymm10 | ymm10 = g*m[4]
+  DB  196,65,44,88,211                    ; vaddps        %ymm11,%ymm10,%ymm10 | ymm10 += m[5]
+  DB  197,52,89,200                       ; vmulps        %ymm0,%ymm9,%ymm9 | ymm9 = r*m[3]
+  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9 | ymm9 = G
+  DB  196,98,125,24,80,24                 ; vbroadcastss  0x18(%rax),%ymm10 | ymm10 = m[6]
+  DB  196,98,125,24,88,28                 ; vbroadcastss  0x1c(%rax),%ymm11 | ymm11 = m[7]
+  DB  196,98,125,24,96,32                 ; vbroadcastss  0x20(%rax),%ymm12 | ymm12 = m[8]
+  DB  197,164,89,201                      ; vmulps        %ymm1,%ymm11,%ymm1 | ymm1 = g*m[7] (g is dead from here)
+  DB  196,193,116,88,204                  ; vaddps        %ymm12,%ymm1,%ymm1 | ymm1 += m[8]
+  DB  197,172,89,192                      ; vmulps        %ymm0,%ymm10,%ymm0 | ymm0 = r*m[6] (r is dead from here)
+  DB  197,252,88,193                      ; vaddps        %ymm1,%ymm0,%ymm0 | ymm0 = Z
+  DB  197,252,83,200                      ; vrcpps        %ymm0,%ymm1 | ymm1 = rcp(Z), approximate reciprocal
+  DB  197,188,89,193                      ; vmulps        %ymm1,%ymm8,%ymm0 | r = R * rcp(Z)
+  DB  197,180,89,201                      ; vmulps        %ymm1,%ymm9,%ymm1 | g = G * rcp(Z)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = next stage
+  DB  255,224                             ; jmpq          *%rax | tail-jump to the next stage
+
 PUBLIC _sk_linear_gradient_2stops_avx
 _sk_linear_gradient_2stops_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -2800,6 +2854,47 @@
   DB  65,15,40,210                        ; movaps        %xmm10,%xmm2
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_matrix_perspective_sse41
+_sk_matrix_perspective_sse41 LABEL PROC   ; xmm0/xmm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8 | xmm8 = r (xmm0 is reused as an accumulator)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  DB  243,15,16,0                         ; movss         (%rax),%xmm0 | xmm0[0] = m[0]
+  DB  243,68,15,16,72,4                   ; movss         0x4(%rax),%xmm9 | xmm9[0] = m[1]
+  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0 | xmm0 = splat(m[0])
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[1])
+  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10 | xmm10[0] = m[2]
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[2])
+  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9 | xmm9 = g*m[1]
+  DB  69,15,88,202                        ; addps         %xmm10,%xmm9 | xmm9 += m[2]
+  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0 | xmm0 = r*m[0]
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0 | xmm0 = R
+  DB  243,68,15,16,72,12                  ; movss         0xc(%rax),%xmm9 | xmm9[0] = m[3]
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[3])
+  DB  243,68,15,16,80,16                  ; movss         0x10(%rax),%xmm10 | xmm10[0] = m[4]
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[4])
+  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11 | xmm11[0] = m[5]
+  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[5])
+  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10 | xmm10 = g*m[4]
+  DB  69,15,88,211                        ; addps         %xmm11,%xmm10 | xmm10 += m[5]
+  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9 | xmm9 = r*m[3]
+  DB  69,15,88,202                        ; addps         %xmm10,%xmm9 | xmm9 = G
+  DB  243,68,15,16,80,24                  ; movss         0x18(%rax),%xmm10 | xmm10[0] = m[6]
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[6])
+  DB  243,68,15,16,88,28                  ; movss         0x1c(%rax),%xmm11 | xmm11[0] = m[7]
+  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[7])
+  DB  243,68,15,16,96,32                  ; movss         0x20(%rax),%xmm12 | xmm12[0] = m[8]
+  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12 | xmm12 = splat(m[8])
+  DB  68,15,89,217                        ; mulps         %xmm1,%xmm11 | xmm11 = g*m[7]
+  DB  69,15,88,220                        ; addps         %xmm12,%xmm11 | xmm11 += m[8]
+  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10 | xmm10 = r*m[6]
+  DB  69,15,88,211                        ; addps         %xmm11,%xmm10 | xmm10 = Z
+  DB  65,15,83,202                        ; rcpps         %xmm10,%xmm1 | xmm1 = rcp(Z), approximate reciprocal
+  DB  15,89,193                           ; mulps         %xmm1,%xmm0 | r = R * rcp(Z)
+  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9 | xmm9 = G * rcp(Z)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = next stage
+  DB  65,15,40,201                        ; movaps        %xmm9,%xmm1 | g = G * rcp(Z)
+  DB  255,224                             ; jmpq          *%rax | tail-jump to the next stage
+
 PUBLIC _sk_linear_gradient_2stops_sse41
 _sk_linear_gradient_2stops_sse41 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -3858,6 +3953,47 @@
   DB  65,15,40,210                        ; movaps        %xmm10,%xmm2
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_matrix_perspective_sse2
+_sk_matrix_perspective_sse2 LABEL PROC    ; xmm0/xmm1 hold r/g of STAGE(matrix_perspective); m = the 9 floats at rax
+  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8 | xmm8 = r (xmm0 is reused as an accumulator)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = m (ctx pointer read via rsi)
+  DB  243,15,16,0                         ; movss         (%rax),%xmm0 | xmm0[0] = m[0]
+  DB  243,68,15,16,72,4                   ; movss         0x4(%rax),%xmm9 | xmm9[0] = m[1]
+  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0 | xmm0 = splat(m[0])
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[1])
+  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10 | xmm10[0] = m[2]
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[2])
+  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9 | xmm9 = g*m[1]
+  DB  69,15,88,202                        ; addps         %xmm10,%xmm9 | xmm9 += m[2]
+  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0 | xmm0 = r*m[0]
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0 | xmm0 = R
+  DB  243,68,15,16,72,12                  ; movss         0xc(%rax),%xmm9 | xmm9[0] = m[3]
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9 | xmm9 = splat(m[3])
+  DB  243,68,15,16,80,16                  ; movss         0x10(%rax),%xmm10 | xmm10[0] = m[4]
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[4])
+  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11 | xmm11[0] = m[5]
+  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[5])
+  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10 | xmm10 = g*m[4]
+  DB  69,15,88,211                        ; addps         %xmm11,%xmm10 | xmm10 += m[5]
+  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9 | xmm9 = r*m[3]
+  DB  69,15,88,202                        ; addps         %xmm10,%xmm9 | xmm9 = G
+  DB  243,68,15,16,80,24                  ; movss         0x18(%rax),%xmm10 | xmm10[0] = m[6]
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10 | xmm10 = splat(m[6])
+  DB  243,68,15,16,88,28                  ; movss         0x1c(%rax),%xmm11 | xmm11[0] = m[7]
+  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11 | xmm11 = splat(m[7])
+  DB  243,68,15,16,96,32                  ; movss         0x20(%rax),%xmm12 | xmm12[0] = m[8]
+  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12 | xmm12 = splat(m[8])
+  DB  68,15,89,217                        ; mulps         %xmm1,%xmm11 | xmm11 = g*m[7]
+  DB  69,15,88,220                        ; addps         %xmm12,%xmm11 | xmm11 += m[8]
+  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10 | xmm10 = r*m[6]
+  DB  69,15,88,211                        ; addps         %xmm11,%xmm10 | xmm10 = Z
+  DB  65,15,83,202                        ; rcpps         %xmm10,%xmm1 | xmm1 = rcp(Z), approximate reciprocal
+  DB  15,89,193                           ; mulps         %xmm1,%xmm0 | r = R * rcp(Z)
+  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9 | xmm9 = G * rcp(Z)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax | rax = next stage
+  DB  65,15,40,201                        ; movaps        %xmm9,%xmm1 | g = G * rcp(Z)
+  DB  255,224                             ; jmpq          *%rax | tail-jump to the next stage
+
 PUBLIC _sk_linear_gradient_2stops_sse2
 _sk_linear_gradient_2stops_sse2 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index dfcd786..2b90fe2 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -859,6 +859,16 @@
     g = G;
     b = B;
 }
+STAGE(matrix_perspective) {
+    // ctx points at 9 floats read as a row-major matrix (the other matrix_ stages are not row-major).
+    auto mat = (const float*)ctx;
+    auto X = mad(r,mat[0], mad(g,mat[1], mat[2])),
+         Y = mad(r,mat[3], mad(g,mat[4], mat[5])),
+         W = mad(r,mat[6], mad(g,mat[7], mat[8]));
+    auto invW = rcp(W);  // one reciprocal of the third row's result scales both outputs
+    r = X * invW;
+    g = Y * invW;
+}
 
 STAGE(linear_gradient_2stops) {
     struct Ctx { F4 c0, dc; };