New analytic AA scan converter using delta (I call it DAA for now)

DAA is:

1. Much simpler than AAA.
   SkScan_AAAPath.cpp is about 1700 lines.
   SkScan_DAAPath.cpp is about 300 lines.
   The whole DAA CL is only about 800 lines.

2. Much faster than AAA for complicated paths.
   The speedup applies to GL backend (including ccpr)!
   Here's the frame time of 'SampleApp --slide Chart' on macbook pro:
     AAA-raster: 33ms
     DAA-raster: 21ms
     AAA-gl:     30ms
     DAA-gl:     20ms
     AAA-ccpr:   18ms
     DAA-ccpr:   12ms
   My linux desktop doesn't have SSE3 so the speedup is smaller
   (~25% for Chart). I believe that DAA is so fast that I can enable
   it for any paths (AAA is not enabled by default for complicated
   paths because it is slow; hence our older supersampling scan
   converter is used for stroking on Chart for AAA-xxx config.)

3. The SkCoverageDelta is suitable for threaded backend with
   out-of-order concurrent scan conversion as commented in the source
   code. Maybe we can also just send deltas to GPU.

4. Similar to most analytic path renderers, the quality is on the best
   ground-truth level, unless there are intersections within a pixel.
   The intersections look good to my eyes although theoretically that
   could be arbitrary far from the ground truth (see my AAA slides).

5. For simple paths, such as circle, triangle, rrect, etc., DAA is
   slower than AAA. But DAA is faster than our older supersampling
   scan converter in most cases. As those simple paths usually don't
   constitute the bottleneck of a picture (skp or svg), I strongly
   recommend use DAA.

6. DAA also heavily favors blitMask so it may work quite well with
   SkRasterPipeline and SkRasterPipelineBlitter.

Finally, please check https://skia-review.googlesource.com/c/22420/
which accelerate DAA by specializing blitCoverageDeltas for
SkARGB32_Blitter and SkARGB32_Black_Blitter. It brings a little(<5%)
speedup. But I couldn't figure out how to reduce the duplicate code
so I don't intend to land it.

Bug: skia:
Change-Id: I3b7ed6a727447922e645b1acb737a506e7c09a4c
Reviewed-on: https://skia-review.googlesource.com/19666
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Cary Clark <caryclark@google.com>
Commit-Queue: Yuqian Li <liyuqian@google.com>
diff --git a/bench/nanobench.cpp b/bench/nanobench.cpp
index 196cb22..798dd0d 100644
--- a/bench/nanobench.cpp
+++ b/bench/nanobench.cpp
@@ -1199,7 +1199,11 @@
     }
 
     gSkUseAnalyticAA = FLAGS_analyticAA;
+    gSkUseDeltaAA = FLAGS_deltaAA;
 
+    if (FLAGS_forceDeltaAA) {
+        gSkForceDeltaAA = true;
+    }
     if (FLAGS_forceAnalyticAA) {
         gSkForceAnalyticAA = true;
     }
diff --git a/dm/DM.cpp b/dm/DM.cpp
index 698a79c..8be3fbc 100644
--- a/dm/DM.cpp
+++ b/dm/DM.cpp
@@ -1289,6 +1289,7 @@
     setup_crash_handler();
 
     gSkUseAnalyticAA = FLAGS_analyticAA;
+    gSkUseDeltaAA = FLAGS_deltaAA;
 
     if (FLAGS_forceAnalyticAA) {
         gSkForceAnalyticAA = true;
diff --git a/gn/core.gni b/gn/core.gni
index d67a26b..ab88241 100644
--- a/gn/core.gni
+++ b/gn/core.gni
@@ -59,6 +59,8 @@
   "$_src/core/SkCachedData.cpp",
   "$_src/core/SkCanvas.cpp",
   "$_src/core/SkCanvasPriv.h",
+  "$_src/core/SkCoverageDelta.h",
+  "$_src/core/SkCoverageDelta.cpp",
   "$_src/core/SkClipStack.cpp",
   "$_src/core/SkClipStack.h",
   "$_src/core/SkClipStackDevice.cpp",
@@ -267,6 +269,7 @@
   "$_src/core/SkScan.h",
   "$_src/core/SkScanPriv.h",
   "$_src/core/SkScan_AAAPath.cpp",
+  "$_src/core/SkScan_DAAPath.cpp",
   "$_src/core/SkScan_AntiPath.cpp",
   "$_src/core/SkScan_Antihair.cpp",
   "$_src/core/SkScan_Hairline.cpp",
diff --git a/samplecode/SampleApp.cpp b/samplecode/SampleApp.cpp
index c36e2e8..0bd4435 100644
--- a/samplecode/SampleApp.cpp
+++ b/samplecode/SampleApp.cpp
@@ -1888,11 +1888,17 @@
             this->resetFPS();
             break;
         case 'A':
-            if (gSkUseAnalyticAA.load() && !gSkForceAnalyticAA.load()) {
+            if (!gSkUseAnalyticAA) {
+                gSkUseAnalyticAA = true;
+            } else if (!gSkForceAnalyticAA && !gSkUseDeltaAA) {
                 gSkForceAnalyticAA = true;
-            } else {
-                gSkUseAnalyticAA = !gSkUseAnalyticAA.load();
+            } else if (!gSkUseDeltaAA) {
                 gSkForceAnalyticAA = false;
+                gSkUseDeltaAA = true;
+            } else if (!gSkForceDeltaAA) {
+                gSkForceDeltaAA = true;
+            } else {
+                gSkUseAnalyticAA = gSkForceAnalyticAA = gSkUseDeltaAA = gSkForceDeltaAA = false;
             }
             this->inval(nullptr);
             this->updateTitle();
@@ -2255,7 +2261,13 @@
         title.prependf("[T%d/%d] ", gSampleWindow->getTiles(), gSampleWindow->getThreads());
     }
 
-    if (gSkUseAnalyticAA) {
+    if (gSkUseDeltaAA) {
+        if (gSkForceDeltaAA) {
+            title.prepend("<FDAA> ");
+        } else {
+            title.prepend("<DAA> ");
+        }
+    } else if (gSkUseAnalyticAA) {
         if (gSkForceAnalyticAA) {
             title.prepend("<FAAA> ");
         } else {
diff --git a/src/core/SkAAClip.cpp b/src/core/SkAAClip.cpp
index 467939b..100ac9c 100644
--- a/src/core/SkAAClip.cpp
+++ b/src/core/SkAAClip.cpp
@@ -1421,7 +1421,9 @@
     BuilderBlitter blitter(&builder);
 
     if (doAA) {
-        if (gSkUseAnalyticAA.load()) {
+        if (!path.isConvex() && gSkUseDeltaAA.load()) {
+            SkScan::DAAFillPath(path, snugClip, &blitter, true);
+        } else if (gSkUseAnalyticAA.load()) {
             SkScan::AAAFillPath(path, snugClip, &blitter, true);
         } else {
             SkScan::AntiFillPath(path, snugClip, &blitter, true);
diff --git a/src/core/SkAnalyticEdge.h b/src/core/SkAnalyticEdge.h
index 8ecc489..533d769 100644
--- a/src/core/SkAnalyticEdge.h
+++ b/src/core/SkAnalyticEdge.h
@@ -178,4 +178,61 @@
     return true;
 }
 
+struct SkBezier {
+    int fCount; // 2 line, 3 quad, 4 cubic
+    SkPoint fP0;
+    SkPoint fP1;
+
+    // See if left shift, covert to SkFDot6, and round has the same top and bottom y.
+    // If so, the edge will be empty.
+    static inline bool IsEmpty(SkScalar y0, SkScalar y1, int shift = 2) {
+        SkScalar scale = (1 << (shift + 6));
+        return SkFDot6Round(int(y0 * scale)) == SkFDot6Round(int(y1 * scale));
+    }
+};
+
+struct SkLine : public SkBezier {
+    bool set(const SkPoint pts[2]){
+        if (IsEmpty(pts[0].fY, pts[1].fY)) {
+            return false;
+        }
+        fCount = 2;
+        fP0 = pts[0];
+        fP1 = pts[1];
+        return true;
+    }
+};
+
+struct SkQuad : public SkBezier {
+    SkPoint fP2;
+
+    bool set(const SkPoint pts[3]){
+        if (IsEmpty(pts[0].fY, pts[2].fY)) {
+            return false;
+        }
+        fCount = 3;
+        fP0 = pts[0];
+        fP1 = pts[1];
+        fP2 = pts[2];
+        return true;
+    }
+};
+
+struct SkCubic : public SkBezier {
+    SkPoint fP2;
+    SkPoint fP3;
+
+    bool set(const SkPoint pts[4]){
+        if (IsEmpty(pts[0].fY, pts[3].fY)) {
+            return false;
+        }
+        fCount = 4;
+        fP0 = pts[0];
+        fP1 = pts[1];
+        fP2 = pts[2];
+        fP3 = pts[3];
+        return true;
+    }
+};
+
 #endif
diff --git a/src/core/SkBlitter.cpp b/src/core/SkBlitter.cpp
index 5abb085..6bd48be 100644
--- a/src/core/SkBlitter.cpp
+++ b/src/core/SkBlitter.cpp
@@ -41,6 +41,110 @@
 }
  */
 
+inline static SkAlpha ScalarToAlpha(SkScalar a) {
+    return (SkAlpha)(a * 255);
+}
+
+void SkBlitter::blitFatAntiRect(const SkRect& rect) {
+    SkIRect bounds = rect.roundOut();
+    SkASSERT(bounds.width() >= 3 && bounds.height() >= 3);
+
+    int         runSize = bounds.width() + 1; // +1 so we can set runs[bounds.width()] = 0
+    void*       storage = this->allocBlitMemory(runSize * (sizeof(int16_t) + sizeof(SkAlpha)));
+    int16_t*    runs    = reinterpret_cast<int16_t*>(storage);
+    SkAlpha*    alphas  = reinterpret_cast<SkAlpha*>(runs + runSize);
+
+    runs[0] = 1;
+    runs[1] = bounds.width() - 2;
+    runs[bounds.width() - 1] = 1;
+    runs[bounds.width()]  = 0;
+
+    SkScalar partialL = bounds.fLeft + 1 - rect.fLeft;
+    SkScalar partialR = rect.fRight - (bounds.fRight - 1);
+    SkScalar partialT = bounds.fTop + 1 - rect.fTop;
+    SkScalar partialB = rect.fBottom - (bounds.fBottom - 1);
+
+    alphas[0] = ScalarToAlpha(partialL * partialT);
+    alphas[1] = ScalarToAlpha(partialT);
+    alphas[bounds.width() - 1] = ScalarToAlpha(partialR * partialT);
+    this->blitAntiH(bounds.fLeft, bounds.fTop, alphas, runs);
+
+    this->blitAntiRect(bounds.fLeft, bounds.fTop + 1, bounds.width() - 2, bounds.height() - 2,
+                       ScalarToAlpha(partialL), ScalarToAlpha(partialR));
+
+    alphas[0] = ScalarToAlpha(partialL * partialB);
+    alphas[1] = ScalarToAlpha(partialB);
+    alphas[bounds.width() - 1] = ScalarToAlpha(partialR * partialB);
+    this->blitAntiH(bounds.fLeft, bounds.fBottom - 1, alphas, runs);
+}
+
+void SkBlitter::blitCoverageDeltas(SkCoverageDeltaList* deltas, const SkIRect& clip,
+                                   bool isEvenOdd, bool isInverse, bool isConvex) {
+    int         runSize = clip.width() + 1; // +1 so we can set runs[clip.width()] = 0
+    void*       storage = this->allocBlitMemory(runSize * (sizeof(int16_t) + sizeof(SkAlpha)));
+    int16_t*    runs    = reinterpret_cast<int16_t*>(storage);
+    SkAlpha*    alphas  = reinterpret_cast<SkAlpha*>(runs + runSize);
+    runs[clip.width()]  = 0; // we must set the last run to 0 so blitAntiH can stop there
+
+    bool canUseMask = !deltas->forceRLE() &&
+                      SkCoverageDeltaMask::CanHandle(SkIRect::MakeLTRB(0, 0, clip.width(), 1));
+    const SkAntiRect& antiRect = deltas->getAntiRect();
+    for(int y = deltas->top(); y < deltas->bottom(); ++y) {
+        // If antiRect is non-empty and we're at its top row, blit it and skip to the bottom
+        if (antiRect.fHeight && y == antiRect.fY) {
+            this->blitAntiRect(antiRect.fX, antiRect.fY, antiRect.fWidth, antiRect.fHeight,
+                               antiRect.fLeftAlpha, antiRect.fRightAlpha);
+            y += antiRect.fHeight - 1; // -1 because ++y in the for loop
+            continue;
+        }
+
+        // If there are too many deltas, sorting will be slow. Using a mask will be much faster.
+        // This is such an important optimization that will bring ~2x speedup for benches like
+        // path_fill_small_long_line and path_stroke_small_sawtooth.
+        if (canUseMask && !deltas->sorted(y) && deltas->count(y) << 3 >= clip.width()) {
+            SkIRect rowIR = SkIRect::MakeLTRB(clip.fLeft, y, clip.fRight, y + 1);
+            SkCoverageDeltaMask mask(rowIR);
+            for(int i = 0; i < deltas->count(y); ++i) {
+                const SkCoverageDelta& delta = deltas->getDelta(y, i);
+                mask.addDelta(delta.fX, y, delta.fDelta);
+            }
+            mask.convertCoverageToAlpha(isEvenOdd, isInverse, isConvex);
+            this->blitMask(mask.prepareSkMask(), rowIR);
+            continue;
+        }
+
+        // The normal flow of blitting deltas starts from here. First sort deltas.
+        deltas->sort(y);
+
+        int     i = 0;              // init delta index to 0
+        int     lastX = clip.fLeft; // init x to clip.fLeft
+        SkFixed coverage = 0;       // init coverage to 0
+
+        // skip deltas with x less than clip.fLeft; they must be precision errors
+        for(; i < deltas->count(y) && deltas->getDelta(y, i).fX < clip.fLeft; ++i);
+        for(; i < deltas->count(y) && deltas->getDelta(y, i).fX < clip.fRight; ++i) {
+            const SkCoverageDelta& delta = deltas->getDelta(y, i);
+            SkASSERT(delta.fX >= lastX);    // delta must be x sorted
+            if (delta.fX > lastX) {         // we have proceeded to a new x (different from lastX)
+                SkAlpha alpha = isConvex ? ConvexCoverageToAlpha(coverage, isInverse)
+                                         : CoverageToAlpha(coverage, isEvenOdd, isInverse);
+                alphas[lastX - clip.fLeft]  = alpha;            // set alpha at lastX
+                runs[lastX - clip.fLeft]    = delta.fX - lastX; // set the run length
+                lastX                       = delta.fX;         // now set lastX to current x
+            }
+            coverage += delta.fDelta; // cumulate coverage with the current delta
+        }
+
+        // Set the alpha and run length from the right-most delta to the right clip boundary
+        SkAlpha alpha = isConvex ? ConvexCoverageToAlpha(coverage, isInverse)
+                                 : CoverageToAlpha(coverage, isEvenOdd, isInverse);
+        alphas[lastX - clip.fLeft]  = alpha;
+        runs[lastX - clip.fLeft]    = clip.fRight - lastX;
+
+        this->blitAntiH(clip.fLeft, y, alphas, runs); // finally blit the current row
+    }
+}
+
 void SkBlitter::blitV(int x, int y, int height, SkAlpha alpha) {
     if (alpha == 255) {
         this->blitRect(x, y, 1, height);
diff --git a/src/core/SkBlitter.h b/src/core/SkBlitter.h
index 921e8f3..c280ac3 100644
--- a/src/core/SkBlitter.h
+++ b/src/core/SkBlitter.h
@@ -11,6 +11,7 @@
 #include "SkAutoMalloc.h"
 #include "SkBitmapProcShader.h"
 #include "SkColor.h"
+#include "SkCoverageDelta.h"
 #include "SkRect.h"
 #include "SkRegion.h"
 #include "SkShaderBase.h"
@@ -31,6 +32,12 @@
 public:
     virtual ~SkBlitter();
 
+    // The actual blitter may speedup the process by rewriting this in a more efficient way.
+    // For example, one may avoid some virtual blitAntiH calls by directly calling
+    // SkBlitRow::Color32.
+    virtual void blitCoverageDeltas(SkCoverageDeltaList* deltas, const SkIRect& clip,
+                                    bool isEvenOdd, bool isInverse, bool isConvex);
+
     /// Blit a horizontal run of one or more pixels.
     virtual void blitH(int x, int y, int width) = 0;
 
@@ -62,6 +69,9 @@
     virtual void blitAntiRect(int x, int y, int width, int height,
                               SkAlpha leftAlpha, SkAlpha rightAlpha);
 
+    // Blit a rect in AA with size at least 3 x 3 (small rect has too many edge cases...)
+    void blitFatAntiRect(const SkRect& rect);
+
     /// Blit a pattern of pixels defined by a rectangle-clipped mask;
     /// typically used for text.
     virtual void blitMask(const SkMask&, const SkIRect& clip);
diff --git a/src/core/SkCoverageDelta.cpp b/src/core/SkCoverageDelta.cpp
new file mode 100644
index 0000000..8f109ce
--- /dev/null
+++ b/src/core/SkCoverageDelta.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkCoverageDelta.h"
+
+SkCoverageDeltaList::SkCoverageDeltaList(SkCoverageDeltaAllocator* alloc, int top, int bottom,
+                                         bool forceRLE) {
+    fAlloc              = alloc;
+    fTop                = top;
+    fBottom             = bottom;
+    fForceRLE           = forceRLE;
+
+    // Init the anti-rect to be empty
+    fAntiRect.fY        = bottom;
+    fAntiRect.fHeight   = 0;
+
+    if (bottom - top <= RESERVED_HEIGHT) {
+        fSorted     = fReservedSorted;
+        fCounts     = fReservedCounts;
+        fMaxCounts  = fReservedMaxCounts;
+        fRows       = fReservedRows - top;
+        fRows[top]  = fReservedStorage;
+    } else {
+        fSorted     = fAlloc->makeArrayDefault<bool>(bottom - top);
+        fCounts     = fAlloc->makeArrayDefault<int>((bottom - top) * 2);
+        fMaxCounts  = fCounts + bottom - top;
+        fRows       = fAlloc->makeArrayDefault<SkCoverageDelta*>(bottom - top) - top;
+        fRows[top]  = fAlloc->makeArrayDefault<SkCoverageDelta>(INIT_ROW_SIZE * (bottom - top));
+    }
+
+    memset(fSorted, true, bottom - top);
+    memset(fCounts, 0, sizeof(int) * (bottom - top));
+
+    // Minus top so we can directly use fCounts[y] instead of fCounts[y - fTop].
+    // Same for fMaxCounts, fRows, and fSorted.
+    fSorted    -= top;
+    fCounts    -= top;
+    fMaxCounts -= top;
+
+    for(int y = top; y < bottom; ++y) {
+        fMaxCounts[y] = INIT_ROW_SIZE;
+    }
+    for(int y = top + 1; y < bottom; ++y) {
+        fRows[y] = fRows[y - 1] + INIT_ROW_SIZE;
+    }
+}
+
+int SkCoverageDeltaMask::ExpandWidth(int width) {
+    int result = width + PADDING * 2;
+    return result + (SIMD_WIDTH - result % SIMD_WIDTH) % SIMD_WIDTH;
+}
+
+bool SkCoverageDeltaMask::CanHandle(const SkIRect& bounds) {
+    // Expand width so we don't have to worry about the boundary
+    return ExpandWidth(bounds.width()) * bounds.height() + PADDING * 2 < MAX_MASK_SIZE;
+}
+
+bool SkCoverageDeltaMask::Suitable(const SkIRect& bounds) {
+    return bounds.width() <= SUITABLE_WIDTH && CanHandle(bounds);
+}
+
+SkCoverageDeltaMask::SkCoverageDeltaMask(const SkIRect& bounds) : fBounds(bounds) {
+    SkASSERT(CanHandle(bounds));
+
+    // Init the anti-rect to be empty
+    fAntiRect.fY        = fBounds.fBottom;
+    fAntiRect.fHeight   = 0;
+
+    fExpandedWidth      = ExpandWidth(fBounds.width());
+
+    // Add PADDING columns so we may access fDeltas[index(-PADDING, 0)]
+    // Minus index(fBounds.fLeft, fBounds.fTop) so we can directly access fDeltas[index(x, y)]
+    fDeltas             = fDeltaStorage + PADDING - this->index(fBounds.fLeft, fBounds.fTop);
+
+    memset(fDeltaStorage, 0, (fExpandedWidth * bounds.height() + PADDING * 2) * sizeof(SkFixed));;
+}
+
+void SkCoverageDeltaMask::convertCoverageToAlpha(bool isEvenOdd, bool isInverse, bool isConvex) {
+    SkFixed* deltaRow = &this->delta(fBounds.fLeft, fBounds.fTop);
+    SkAlpha* maskRow = fMask;
+    for(int iy = 0; iy < fBounds.height(); ++iy) {
+        // If we're inside fAntiRect, blit it to the mask and advance to its bottom
+        if (fAntiRect.fHeight && iy == fAntiRect.fY - fBounds.fTop) {
+            // Blit the mask
+            int L = fAntiRect.fX - fBounds.fLeft;
+            for(int i = 0; i < fAntiRect.fHeight; ++i) {
+                SkAlpha* tMask = maskRow + L;
+                if (fAntiRect.fLeftAlpha) {
+                    tMask[0] = fAntiRect.fLeftAlpha;
+                }
+                memset(tMask + 1, 0xff, fAntiRect.fWidth);
+                if (fAntiRect.fRightAlpha) {
+                    tMask[fAntiRect.fWidth + 1] = fAntiRect.fRightAlpha;
+                }
+                maskRow += fBounds.width();
+            }
+
+            // Advance to the bottom (maskRow is already advanced to the bottom).
+            deltaRow    += fExpandedWidth * fAntiRect.fHeight;
+            iy          += fAntiRect.fHeight - 1; // -1 because we'll ++iy after continue
+            continue;
+        }
+
+        // Otherwise, cumulate deltas into coverages, and convert them into alphas
+        SkFixed c[SIMD_WIDTH] = {0}; // prepare SIMD_WIDTH coverages at a time
+        for(int ix = 0; ix < fExpandedWidth; ix += SIMD_WIDTH) {
+            // Future todo: is it faster to process SIMD_WIDTH rows at a time so we can use SIMD
+            // for coverage accumulation?
+
+            // Cumulate deltas to get SIMD_WIDTH new coverages
+            c[0] = c[SIMD_WIDTH - 1] + deltaRow[ix];
+            for(int j = 1; j < SIMD_WIDTH; ++j) {
+                c[j] = c[j - 1] + deltaRow[ix + j];
+            }
+
+            // My SIMD CoverageToAlpha seems to be only faster with SSSE3.
+            // (On linux, even with -mavx2, my SIMD still seems to be slow...)
+            // Even with only SSSE2, it's still faster to do SIMD_WIDTH non-SIMD computations at one
+            // time (i.e., SIMD_WIDTH = 8 is faster than SIMD_WIDTH = 1 even if SK_CPU_SSE_LEVEL is
+            // less than SK_CPU_SSE_LEVEL_SSSE3). Maybe the compiler is doing some SIMD by itself.
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+            using SkNi = SkNx<SIMD_WIDTH, int>;
+
+            SkNi cn = SkNi::Load(c);
+            SkNi an = isConvex ? ConvexCoverageToAlpha(cn, isInverse)
+                               : CoverageToAlpha(cn, isEvenOdd, isInverse);
+            SkNx_cast<SkAlpha>(an).store(maskRow + ix);
+#else
+            for(int j = 0; j < SIMD_WIDTH; ++j) {
+                maskRow[ix + j] = isConvex ? ConvexCoverageToAlpha(c[j], isInverse)
+                                           : CoverageToAlpha(c[j], isEvenOdd, isInverse);
+            }
+#endif
+        }
+
+        // Finally, advance to the next row
+        deltaRow    += fExpandedWidth;
+        maskRow     += fBounds.width();
+    }
+}
diff --git a/src/core/SkCoverageDelta.h b/src/core/SkCoverageDelta.h
new file mode 100644
index 0000000..e0ecb9d
--- /dev/null
+++ b/src/core/SkCoverageDelta.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkCoverageDelta_DEFINED
+#define SkCoverageDelta_DEFINED
+
+#include "SkArenaAlloc.h"
+#include "SkFixed.h"
+#include "SkMask.h"
+#include "SkTSort.h"
+#include "SkUtils.h"
+
+// Future todo: maybe we can make fX and fDelta 16-bit long to speed it up a little bit.
+struct SkCoverageDelta {
+    int     fX;     // the y coordinate will be implied in SkCoverageDeltaList
+    SkFixed fDelta; // the amount that the alpha changed
+
+    // Sort according to fX
+    bool operator<(const SkCoverageDelta& other) const {
+        return fX < other.fX;
+    }
+};
+
+// All the arguments needed for SkBlitter::blitAntiRect
+struct SkAntiRect {
+    int     fX;
+    int     fY;
+    int     fWidth;
+    int     fHeight;
+    SkAlpha fLeftAlpha;
+    SkAlpha fRightAlpha;
+};
+
+using SkCoverageDeltaAllocator = SkSTArenaAlloc<256>;
+
+// A list of SkCoverageDelta with y from top() to bottom().
+// For each row y, there are count(y) number of deltas.
+// You can ask whether they are sorted or not by sorted(y), and you can sort them by sort(y).
+// Once sorted, getDelta(y, i) should return the i-th leftmost delta on row y.
+class SkCoverageDeltaList {
+public:
+    // We can store INIT_ROW_SIZE deltas per row (i.e., per y-scanline) initially
+    static constexpr int INIT_ROW_SIZE = 32;
+    static constexpr int RESERVED_HEIGHT = 128; // reserve this many rows on stack memory
+
+    SkCoverageDeltaList(SkCoverageDeltaAllocator* alloc, int top, int bottom, bool forceRLE);
+
+    inline int  top() const { return fTop; }
+    inline int  bottom() const { return fBottom; }
+    inline bool forceRLE() const { return fForceRLE; }
+    inline int  count(int y) const { this->checkY(y); return fCounts[y]; }
+    inline bool sorted(int y) const { this->checkY(y); return fSorted[y]; }
+    inline void addDelta(int x, int y, SkFixed delta) { this->push_back(y, {x, delta}); }
+
+    inline const SkCoverageDelta& getDelta(int y, int i) const {
+        this->checkY(y);
+        SkASSERT(i < fCounts[y]);
+        return fRows[y][i];
+    }
+
+    // It might be better to sort right before blitting to make the memory hot
+    inline void sort(int y) {
+        this->checkY(y);
+        if (!fSorted[y]) {
+            SkTQSort(fRows[y], fRows[y] + fCounts[y] - 1);
+            fSorted[y] = true;
+        }
+    }
+
+    inline const SkAntiRect& getAntiRect() const { return fAntiRect; }
+    inline void setAntiRect(int x, int y, int width, int height,
+            SkAlpha leftAlpha, SkAlpha rightAlpha) {
+        fAntiRect = {x, y, width, height, leftAlpha, rightAlpha};
+    }
+
+    inline void push_back(int y, const SkCoverageDelta& delta) {
+        this->checkY(y);
+        if (fCounts[y] == fMaxCounts[y]) {
+            fMaxCounts[y] *= 2;
+            SkCoverageDelta* newRow = fAlloc->makeArrayDefault<SkCoverageDelta>(fMaxCounts[y]);
+            memcpy(newRow, fRows[y], sizeof(SkCoverageDelta) * fCounts[y]);
+            fRows[y] = newRow;
+        }
+        SkASSERT(fCounts[y] < fMaxCounts[y]);
+        fRows[y][fCounts[y]++] = delta;
+        fSorted[y] = fSorted[y] && (fCounts[y] == 1 || delta.fX >= fRows[y][fCounts[y] - 2].fX);
+    }
+
+private:
+    SkCoverageDeltaAllocator*   fAlloc;
+    SkCoverageDelta**           fRows;
+    bool*                       fSorted;
+    int*                        fCounts;
+    int*                        fMaxCounts;
+    int                         fTop;
+    int                         fBottom;
+    SkAntiRect                  fAntiRect;
+    bool                        fForceRLE;
+
+    SkCoverageDelta             fReservedStorage[RESERVED_HEIGHT * INIT_ROW_SIZE];
+    SkCoverageDelta*            fReservedRows[RESERVED_HEIGHT];
+    bool                        fReservedSorted[RESERVED_HEIGHT];
+    int                         fReservedCounts[RESERVED_HEIGHT];
+    int                         fReservedMaxCounts[RESERVED_HEIGHT];
+
+    inline void checkY(int y) const { SkASSERT(y >= fTop && y < fBottom); }
+};
+
+class SkCoverageDeltaMask {
+public:
+    // 1 for precision error, 1 for boundary delta (e.g., -SK_Fixed1 at fBounds.fRight + 1)
+    static constexpr int PADDING        = 2;
+
+    static constexpr int SIMD_WIDTH     = 8;
+    static constexpr int SUITABLE_WIDTH = 32;
+    static constexpr int MAX_MASK_SIZE  = 2048;
+
+    // Expand PADDING on both sides, and make it a multiple of SIMD_WIDTH
+    static int  ExpandWidth(int width);
+    static bool CanHandle(const SkIRect& bounds);   // whether bounds fits into MAX_MASK_SIZE
+    static bool Suitable(const SkIRect& bounds);    // CanHandle(bounds) && width <= SUITABLE_WIDTH
+
+    SkCoverageDeltaMask(const SkIRect& bounds);
+
+    inline int              top()       const { return fBounds.fTop; }
+    inline int              bottom()    const { return fBounds.fBottom; }
+    inline SkAlpha*         getMask()         { return fMask; }
+    inline const SkIRect&   getBounds() const { return fBounds; }
+
+    inline void             addDelta (int x, int y, SkFixed delta) { this->delta(x, y) += delta; }
+    inline SkFixed&         delta    (int x, int y) {
+        this->checkX(x);
+        this->checkY(y);
+        return fDeltas[this->index(x, y)];
+    }
+
+    inline void setAntiRect(int x, int y, int width, int height,
+                            SkAlpha leftAlpha, SkAlpha rightAlpha) {
+        fAntiRect = {x, y, width, height, leftAlpha, rightAlpha};
+    }
+
+    inline SkMask prepareSkMask() {
+        SkMask mask;
+        mask.fImage     = fMask;
+        mask.fBounds    = fBounds;
+        mask.fRowBytes  = fBounds.width();
+        mask.fFormat    = SkMask::kA8_Format;
+        return mask;
+    }
+
+    void convertCoverageToAlpha(bool isEvenOdd, bool isInverse, bool isConvex);
+
+private:
+    SkIRect     fBounds;
+    SkFixed     fDeltaStorage[MAX_MASK_SIZE];
+    SkFixed*    fDeltas;
+    SkAlpha     fMask[MAX_MASK_SIZE];
+    int         fExpandedWidth;
+    SkAntiRect  fAntiRect;
+
+    inline int  index(int x, int y) const { return y * fExpandedWidth + x; }
+    inline void checkY(int y) const { SkASSERT(y >= fBounds.fTop && y < fBounds.fBottom); }
+    inline void checkX(int x) const {
+        SkASSERT(x >= fBounds.fLeft - PADDING && x < fBounds.fRight + PADDING);
+    }
+};
+
+static SK_ALWAYS_INLINE SkAlpha CoverageToAlpha(SkFixed coverage, bool isEvenOdd, bool isInverse) {
+    SkAlpha result;
+    if (isEvenOdd) {
+        SkFixed mod17 = coverage & 0x1ffff;
+        SkFixed mod16 = coverage & 0xffff;
+        result = SkTPin(SkAbs32((mod16 << 1) - mod17) >> 8, 0, 255);
+    } else {
+        result = SkTPin(SkAbs32(coverage) >> 8, 0, 255);
+    }
+    return isInverse ? 255 - result : result;
+}
+
+template<typename T>
+static SK_ALWAYS_INLINE T CoverageToAlpha(T coverage, bool isEvenOdd, bool isInverse) {
+    T t0(0), t255(255);
+    T result;
+    if (isEvenOdd) {
+        T mod17 = coverage & 0x1ffff;
+        T mod16 = coverage & 0xffff;
+        result = ((mod16 << 1) - mod17).abs() >> 8;
+    } else {
+        result = coverage.abs() >> 8;;
+    }
+    result = T::Min(result, t255);
+    result = T::Max(result, t0);
+    return isInverse ? 255 - result : result;
+}
+
+// For convex paths (including inverse mode), the coverage is guaranteed to be
+// between [-SK_Fixed1, SK_Fixed1] so we can skip isEvenOdd and SkTPin.
+static SK_ALWAYS_INLINE SkAlpha ConvexCoverageToAlpha(SkFixed coverage, bool isInverse) {
+    SkASSERT(coverage >= -SK_Fixed1 && coverage <= SK_Fixed1);
+    int result = SkAbs32(coverage) >> 8;
+    result -= (result >> 8); // 256 to 255
+    return isInverse ? 255 - result : result;
+}
+
+template<typename T>
+static SK_ALWAYS_INLINE T ConvexCoverageToAlpha(T coverage, bool isInverse) {
+    // allTrue is not implemented
+    // SkASSERT((coverage >= 0).allTrue() && (coverage <= SK_Fixed1).allTrue());
+    T result = coverage.abs() >> 8;
+    result -= (result >> 8); // 256 to 255
+    return isInverse ? 255 - result : result;
+}
+
+#endif
diff --git a/src/core/SkEdgeBuilder.cpp b/src/core/SkEdgeBuilder.cpp
index 38519304..7a02187 100644
--- a/src/core/SkEdgeBuilder.cpp
+++ b/src/core/SkEdgeBuilder.cpp
@@ -65,7 +65,7 @@
 
 SkEdgeBuilder::Combine SkEdgeBuilder::CombineVertical(
         const SkAnalyticEdge* edge, SkAnalyticEdge* last) {
-    SkASSERT(fAnalyticAA);
+    SkASSERT(fEdgeType == kAnalyticEdge);
     if (last->fCurveCount || last->fDX || edge->fX != last->fX) {
         return kNo_Combine;
     }
@@ -115,12 +115,17 @@
 }
 
 bool SkEdgeBuilder::vertical_line(const SkAnalyticEdge* edge) {
-    SkASSERT(fAnalyticAA);
+    SkASSERT(fEdgeType == kAnalyticEdge);
     return !edge->fDX && !edge->fCurveCount;
 }
 
 void SkEdgeBuilder::addLine(const SkPoint pts[]) {
-    if (fAnalyticAA) {
+    if (fEdgeType == kBezier) {
+        SkLine* line = fAlloc.make<SkLine>();
+        if (line->set(pts)) {
+            fList.push(line);
+        }
+    } else if (fEdgeType == kAnalyticEdge) {
         SkAnalyticEdge* edge = fAlloc.make<SkAnalyticEdge>();
         if (edge->setLine(pts[0], pts[1])) {
             if (vertical_line(edge) && fList.count()) {
@@ -160,7 +165,12 @@
 }
 
 void SkEdgeBuilder::addQuad(const SkPoint pts[]) {
-    if (fAnalyticAA) {
+    if (fEdgeType == kBezier) {
+        SkQuad* quad = fAlloc.make<SkQuad>();
+        if (quad->set(pts)) {
+            fList.push(quad);
+        }
+    } else if (fEdgeType == kAnalyticEdge) {
         SkAnalyticQuadraticEdge* edge = fAlloc.make<SkAnalyticQuadraticEdge>();
         if (edge->setQuadratic(pts)) {
             fList.push(edge);
@@ -178,7 +188,12 @@
 }
 
 void SkEdgeBuilder::addCubic(const SkPoint pts[]) {
-    if (fAnalyticAA) {
+    if (fEdgeType == kBezier) {
+        SkCubic* cubic = fAlloc.make<SkCubic>();
+        if (cubic->set(pts)) {
+            fList.push(cubic);
+        }
+    } else if (fEdgeType == kAnalyticEdge) {
         SkAnalyticCubicEdge* edge = fAlloc.make<SkAnalyticCubicEdge>();
         if (edge->setCubic(pts)) {
             fList.push(edge);
@@ -232,7 +247,7 @@
 
 SkEdgeBuilder::Combine SkEdgeBuilder::checkVertical(const SkAnalyticEdge* edge,
         SkAnalyticEdge** edgePtr) {
-    SkASSERT(fAnalyticAA);
+    SkASSERT(fEdgeType == kAnalyticEdge);
     return !vertical_line(edge) || edgePtr <= (SkAnalyticEdge**)fEdgeList ? kNo_Combine :
             CombineVertical(edge, edgePtr[-1]);
 }
@@ -251,9 +266,22 @@
         maxEdgeCount *= SkLineClipper::kMaxClippedLineSegments;
     }
 
-    size_t edgeSize = fAnalyticAA ? sizeof(SkAnalyticEdge) : sizeof(SkEdge);
-    char* edge = fAnalyticAA ? (char*)fAlloc.makeArrayDefault<SkAnalyticEdge>(maxEdgeCount)
-                             : (char*)fAlloc.makeArrayDefault<SkEdge>(maxEdgeCount);
+    size_t edgeSize;
+    char* edge;
+    switch (fEdgeType) {
+        case kEdge:
+            edgeSize = sizeof(SkEdge);
+            edge = (char*)fAlloc.makeArrayDefault<SkEdge>(maxEdgeCount);
+            break;
+        case kAnalyticEdge:
+            edgeSize = sizeof(SkAnalyticEdge);
+            edge = (char*)fAlloc.makeArrayDefault<SkAnalyticEdge>(maxEdgeCount);
+            break;
+        case kBezier:
+            edgeSize = sizeof(SkLine);
+            edge = (char*)fAlloc.makeArrayDefault<SkLine>(maxEdgeCount);
+            break;
+    }
 
     SkDEBUGCODE(char* edgeStart = edge);
     char** edgePtr = fAlloc.makeArrayDefault<char*>(maxEdgeCount);
@@ -275,20 +303,7 @@
                     int lineCount = SkLineClipper::ClipLine(pts, clip, lines, canCullToTheRight);
                     SkASSERT(lineCount <= SkLineClipper::kMaxClippedLineSegments);
                     for (int i = 0; i < lineCount; i++) {
-                        bool setLineResult = fAnalyticAA ?
-                                ((SkAnalyticEdge*)edge)->setLine(lines[i], lines[i + 1]) :
-                                ((SkEdge*)edge)->setLine(lines[i], lines[i + 1], shiftUp);
-                        if (setLineResult) {
-                            Combine combine = fAnalyticAA ?
-                                    checkVertical((SkAnalyticEdge*)edge, (SkAnalyticEdge**)edgePtr) :
-                                    checkVertical((SkEdge*)edge, (SkEdge**)edgePtr);
-                            if (kNo_Combine == combine) {
-                                *edgePtr++ = edge;
-                                edge += edgeSize;
-                            } else if (kTotal_Combine == combine) {
-                                --edgePtr;
-                            }
-                        }
+                        this->addPolyLine(lines + i, edge, edgeSize, edgePtr, shiftUp);
                     }
                     break;
                 }
@@ -306,20 +321,7 @@
                     // the corresponding line/quad/cubic verbs
                     break;
                 case SkPath::kLine_Verb: {
-                    bool setLineResult = fAnalyticAA ?
-                            ((SkAnalyticEdge*)edge)->setLine(pts[0], pts[1]) :
-                            ((SkEdge*)edge)->setLine(pts[0], pts[1], shiftUp);
-                    if (setLineResult) {
-                        Combine combine = fAnalyticAA ?
-                                checkVertical((SkAnalyticEdge*)edge, (SkAnalyticEdge**)edgePtr) :
-                                checkVertical((SkEdge*)edge, (SkEdge**)edgePtr);
-                        if (kNo_Combine == combine) {
-                            *edgePtr++ = edge;
-                            edge += edgeSize;
-                        } else if (kTotal_Combine == combine) {
-                            --edgePtr;
-                        }
-                    }
+                    this->addPolyLine(pts, edge, edgeSize, edgePtr, shiftUp);
                     break;
                 }
                 default:
@@ -342,11 +344,11 @@
 }
 
 int SkEdgeBuilder::build(const SkPath& path, const SkIRect* iclip, int shiftUp,
-                         bool canCullToTheRight, bool analyticAA) {
+                         bool canCullToTheRight, EdgeType edgeType) {
     fAlloc.reset();
     fList.reset();
     fShiftUp = shiftUp;
-    fAnalyticAA = analyticAA;
+    fEdgeType = edgeType;
 
     if (SkPath::kLine_SegmentMask == path.getSegmentMasks()) {
         return this->buildPoly(path, iclip, shiftUp, canCullToTheRight);
@@ -443,12 +445,12 @@
 }
 
 int SkEdgeBuilder::build_edges(const SkPath& path, const SkIRect* shiftedClip,
-        int shiftEdgesUp, bool pathContainedInClip, bool analyticAA) {
+        int shiftEdgesUp, bool pathContainedInClip, EdgeType edgeType) {
     // If we're convex, then we need both edges, even the right edge is past the clip
     const bool canCullToTheRight = !path.isConvex();
 
     const SkIRect* builderClip = pathContainedInClip ? nullptr : shiftedClip;
-    int count = this->build(path, builderClip, shiftEdgesUp, canCullToTheRight, analyticAA);
+    int count = this->build(path, builderClip, shiftEdgesUp, canCullToTheRight, edgeType);
     SkASSERT(count >= 0);
 
     // canCullToRight == false should imply count != 1
diff --git a/src/core/SkEdgeBuilder.h b/src/core/SkEdgeBuilder.h
index 2f2b351..b876fcf 100644
--- a/src/core/SkEdgeBuilder.h
+++ b/src/core/SkEdgeBuilder.h
@@ -10,6 +10,8 @@
 #include "SkArenaAlloc.h"
 #include "SkRect.h"
 #include "SkTDArray.h"
+#include "SkEdge.h"
+#include "SkAnalyticEdge.h"
 
 struct SkEdge;
 struct SkAnalyticEdge;
@@ -18,18 +20,27 @@
 
 class SkEdgeBuilder {
 public:
+    enum EdgeType {
+        kEdge,
+        kAnalyticEdge,
+        kBezier
+    };
+
+    // static constexpr int kEdgeSizes[3] = {sizeof(SkEdge), sizeof(SkAnalyticEdge), sizeof(SkBezier)};
+
     SkEdgeBuilder();
 
     // returns the number of built edges. The array of those edge pointers
     // is returned from edgeList().
     int build(const SkPath& path, const SkIRect* clip, int shiftUp, bool clipToTheRight,
-              bool analyticAA = false);
+              EdgeType edgeType = kEdge);
 
     int build_edges(const SkPath& path, const SkIRect* shiftedClip,
-            int shiftEdgesUp, bool pathContainedInClip, bool analyticAA = false);
+            int shiftEdgesUp, bool pathContainedInClip, EdgeType edgeType = kEdge);
 
     SkEdge** edgeList() { return (SkEdge**)fEdgeList; }
     SkAnalyticEdge** analyticEdgeList() { return (SkAnalyticEdge**)fEdgeList; }
+    SkBezier** bezierList() { return (SkBezier**)fEdgeList; }
 
 private:
     enum Combine {
@@ -57,7 +68,7 @@
     void**      fEdgeList;
 
     int         fShiftUp;
-    bool        fAnalyticAA;
+    EdgeType    fEdgeType;
 
 public:
     void addLine(const SkPoint pts[]);
@@ -66,6 +77,32 @@
     void addClipper(SkEdgeClipper*);
 
     int buildPoly(const SkPath& path, const SkIRect* clip, int shiftUp, bool clipToTheRight);
+
+    inline void addPolyLine(SkPoint pts[], char* &edge, size_t edgeSize, char** &edgePtr,
+            int shiftUp) {
+        if (fEdgeType == kBezier) {
+            if (((SkLine*)edge)->set(pts)) {
+                *edgePtr++ = edge;
+                edge += edgeSize;
+            }
+            return;
+        }
+        bool analyticAA = fEdgeType == kAnalyticEdge;
+        bool setLineResult = analyticAA ?
+                ((SkAnalyticEdge*)edge)->setLine(pts[0], pts[1]) :
+                ((SkEdge*)edge)->setLine(pts[0], pts[1], shiftUp);
+        if (setLineResult) {
+            Combine combine = analyticAA ?
+                    checkVertical((SkAnalyticEdge*)edge, (SkAnalyticEdge**)edgePtr) :
+                    checkVertical((SkEdge*)edge, (SkEdge**)edgePtr);
+            if (kNo_Combine == combine) {
+                *edgePtr++ = edge;
+                edge += edgeSize;
+            } else if (kTotal_Combine == combine) {
+                --edgePtr;
+            }
+        }
+    }
 };
 
 #endif
diff --git a/src/core/SkScan.cpp b/src/core/SkScan.cpp
index c2f00f3..b62c972 100644
--- a/src/core/SkScan.cpp
+++ b/src/core/SkScan.cpp
@@ -18,6 +18,10 @@
 
 std::atomic<bool> gSkForceAnalyticAA{false};
 
+std::atomic<bool> gSkUseDeltaAA{false};
+
+std::atomic<bool> gSkForceDeltaAA{false};
+
 static inline void blitrect(SkBlitter* blitter, const SkIRect& r) {
     blitter->blitRect(r.fLeft, r.fTop, r.width(), r.height());
 }
diff --git a/src/core/SkScan.h b/src/core/SkScan.h
index e5c6047..8bb9d1f 100644
--- a/src/core/SkScan.h
+++ b/src/core/SkScan.h
@@ -23,6 +23,8 @@
 */
 typedef SkIRect SkXRect;
 
+extern std::atomic<bool> gSkUseDeltaAA;
+extern std::atomic<bool> gSkForceDeltaAA;
 extern std::atomic<bool> gSkUseAnalyticAA;
 extern std::atomic<bool> gSkForceAnalyticAA;
 
@@ -89,6 +91,8 @@
     static void AntiHairLineRgn(const SkPoint[], int count, const SkRegion*, SkBlitter*);
     static void AAAFillPath(const SkPath& path, const SkRegion& origClip, SkBlitter* blitter,
                             bool forceRLE = false); // SkAAClip uses forceRLE
+    static void DAAFillPath(const SkPath& path, const SkRegion& origClip, SkBlitter* blitter,
+                            bool forceRLE = false);
 };
 
 /** Assign an SkXRect from a SkIRect, by promoting the src rect's coordinates
diff --git a/src/core/SkScan_AAAPath.cpp b/src/core/SkScan_AAAPath.cpp
index 8917c7f..0da39d5 100644
--- a/src/core/SkScan_AAAPath.cpp
+++ b/src/core/SkScan_AAAPath.cpp
@@ -1575,7 +1575,8 @@
     SkASSERT(blitter);
 
     SkEdgeBuilder builder;
-    int count = builder.build_edges(path, &clipRect, 0, pathContainedInClip, true);
+    int count = builder.build_edges(path, &clipRect, 0, pathContainedInClip,
+                                    SkEdgeBuilder::kAnalyticEdge);
     SkAnalyticEdge** list = builder.analyticEdgeList();
 
     SkIRect rect = clipRect;
diff --git a/src/core/SkScan_AntiPath.cpp b/src/core/SkScan_AntiPath.cpp
index 69b5ba1..2a92b42 100644
--- a/src/core/SkScan_AntiPath.cpp
+++ b/src/core/SkScan_AntiPath.cpp
@@ -634,6 +634,14 @@
     }
 }
 
+static bool suitableForDAA(const SkPath& path) {
+    if (gSkForceAnalyticAA.load()) {
+        return true;
+    }
+    const SkRect& bounds = path.getBounds();
+    return !path.isConvex() && path.countPoints() >= SkTMax(bounds.width(), bounds.height()) / 8;
+}
+
 static bool suitableForAAA(const SkPath& path) {
     if (gSkForceAnalyticAA.load()) {
         return true;
@@ -659,9 +667,11 @@
     using FillPathProc = void(*)(const SkPath&, const SkRegion&, SkBlitter*, bool);
     FillPathProc fillPathProc = &SkScan::AntiFillPath;
 
-    // Do not use AAA if path is too complicated:
-    // there won't be any speedup or significant visual improvement.
-    if (gSkUseAnalyticAA.load() && suitableForAAA(path)) {
+    if (gSkUseDeltaAA.load() && suitableForDAA(path)) {
+        fillPathProc = &SkScan::DAAFillPath;
+    } else if (gSkUseAnalyticAA.load() && suitableForAAA(path)) {
+        // Do not use AAA if path is too complicated:
+        // there won't be any speedup or significant visual improvement.
         fillPathProc = &SkScan::AAAFillPath;
     }
 
@@ -673,6 +683,6 @@
 
         tmp.setRect(clip.getBounds());
         aaBlitter.init(blitter, &clip.aaRgn());
-        fillPathProc(path, tmp, &aaBlitter, true);
+        fillPathProc(path, tmp, &aaBlitter, true); // SkAAClipBlitter can blitMask, why forceRLE?
     }
 }
diff --git a/src/core/SkScan_DAAPath.cpp b/src/core/SkScan_DAAPath.cpp
new file mode 100644
index 0000000..295d621
--- /dev/null
+++ b/src/core/SkScan_DAAPath.cpp
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkAnalyticEdge.h"
+#include "SkAntiRun.h"
+#include "SkAutoMalloc.h"
+#include "SkBlitter.h"
+#include "SkCoverageDelta.h"
+#include "SkEdge.h"
+#include "SkEdgeBuilder.h"
+#include "SkGeometry.h"
+#include "SkMask.h"
+#include "SkPath.h"
+#include "SkQuadClipper.h"
+#include "SkRasterClip.h"
+#include "SkRegion.h"
+#include "SkScan.h"
+#include "SkScanPriv.h"
+#include "SkTSort.h"
+#include "SkTemplates.h"
+#include "SkUtils.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*
+
+DAA stands for Delta-based Anti-Aliasing.
+
+This is an improved version of our analytic AA algorithm (in SkScan_AAAPath.cpp)
+which first scan convert paths into coverage deltas (this step can happen out of order,
+and we don't seem to be needed to worry about the intersection, clamping, etc.),
+and then use a single blitter run to convert all those deltas into the final alphas.
+
+Before we go to the final blitter run, we'll use SkFixed for all delta values so we
+don't ever have to worry about under/overflow.
+
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+
+// The following helper functions are the same as those from SkScan_AAAPath.cpp
+// except that we use SkFixed everywhere instead of SkAlpha.
+
+static inline SkFixed SkFixedMul_lowprec(SkFixed a, SkFixed b) {
+    return (a >> 8) * (b >> 8);
+}
+
+// Return the alpha of a trapezoid whose height is 1
+static inline SkFixed trapezoidToAlpha(SkFixed l1, SkFixed l2) {
+    SkASSERT(l1 >= 0 && l2 >= 0);
+    return (l1 + l2) >> 1;
+}
+
+// The alpha of right-triangle (a, a*b)
+static inline SkFixed partialTriangleToAlpha(SkFixed a, SkFixed b) {
+    SkASSERT(a <= SK_Fixed1);
+    // SkFixedMul(SkFixedMul(a, a), b) >> 1
+    // return ((((a >> 8) * (a >> 8)) >> 8) * (b >> 8)) >> 1;
+    return (a >> 11) * (a >> 11) * (b >> 11);
+}
+
+static inline SkFixed getPartialAlpha(SkFixed alpha, SkFixed partialMultiplier) {
+    // DAA should not be so sensitive to the precision (compared to AAA).
+    return SkFixedMul_lowprec(alpha, partialMultiplier);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template<bool isPartial, class Deltas>
+static inline void add_coverage_delta_segment(int y, SkFixed rowHeight, const SkAnalyticEdge* edge,
+        SkFixed nextX, Deltas* deltas) { // rowHeight=fullAlpha
+    SkASSERT(rowHeight <= SK_Fixed1 && rowHeight >= 0);
+
+    // Let's see if multiplying sign is faster than multiplying edge->fWinding.
+    // (Compiler should be able to optimize multiplication with 1/-1?)
+    int sign = edge->fWinding == 1 ? 1 : -1;
+
+    SkFixed l = SkTMin(edge->fX, nextX);
+    SkFixed r = edge->fX + nextX - l;
+    int     L = SkFixedFloorToInt(l);
+    int     R = SkFixedCeilToInt(r);
+    int     len = R - L;
+
+    switch (len) {
+        case 0: {
+            deltas->addDelta(L, y, rowHeight * sign);
+            return;
+        }
+        case 1: {
+            SkFixed fixedR  = SkIntToFixed(R);
+            SkFixed alpha   = trapezoidToAlpha(fixedR - l, fixedR - r);
+            if (isPartial) {
+                alpha = getPartialAlpha(alpha, rowHeight);
+            }
+            deltas->addDelta(L,     y,  alpha * sign);
+            deltas->addDelta(L + 1, y,  (rowHeight - alpha) * sign);
+            return;
+        }
+        case 2: {
+            SkFixed middle  = SkIntToFixed(L + 1);
+            SkFixed x1      = middle - l;
+            SkFixed x2      = r - middle;
+            SkFixed alpha1  = partialTriangleToAlpha(x1, edge->fDY);
+            SkFixed alpha2  = rowHeight - partialTriangleToAlpha(x2, edge->fDY);
+            deltas->addDelta(L,     y,  alpha1 * sign);
+            deltas->addDelta(L + 1, y,  (alpha2 - alpha1) * sign);
+            deltas->addDelta(L + 2, y,  (rowHeight - alpha2) * sign);
+            return;
+        }
+    }
+
+    // When len > 2, computations are similar to computeAlphaAboveLine in SkScan_AAAPath.cpp
+    SkFixed dY      = edge->fDY;
+    SkFixed fixedL  = SkIntToFixed(L);
+    SkFixed fixedR  = SkIntToFixed(R);
+    SkFixed first   = SK_Fixed1 + fixedL - l; // horizontal edge of the left-most triangle
+    SkFixed last    = r - (fixedR - SK_Fixed1); // horizontal edge of the right-most triangle
+    SkFixed firstH  = SkFixedMul_lowprec(first, dY); // vertical edge of the left-most triangle
+
+    SkFixed alpha0  = SkFixedMul_lowprec(first, firstH) >> 1;   // triangle alpha
+    SkFixed alpha1  = firstH + (dY >> 1);                       // rectangle plus triangle
+    deltas->addDelta(L, y, alpha0 * sign);
+    deltas->addDelta(L + 1, y, (alpha1 - alpha0) * sign);
+    for(int i = 2; i < len - 1; ++i) {
+        deltas->addDelta(L + i, y, dY * sign); // the increment is always a rect of height = dY
+    }
+
+    SkFixed alphaR2     = alpha1 + dY * (len - 3);                      // the alpha at R - 2
+    SkFixed lastAlpha   = rowHeight - partialTriangleToAlpha(last, dY); // the alpha at R - 1
+    deltas->addDelta(R - 1, y, (lastAlpha - alphaR2) * sign);
+    deltas->addDelta(R,     y, (rowHeight - lastAlpha) * sign);
+}
+
+class XLessThan {
+public:
+    bool operator()(const SkBezier* a, const SkBezier* b) {
+        return a->fP0.fX + a->fP1.fX < b->fP0.fX + b->fP1.fX;
+    }
+};
+
+class YLessThan {
+public:
+    bool operator()(const SkBezier* a, const SkBezier* b) {
+        return a->fP0.fY + a->fP1.fY < b->fP0.fY + b->fP1.fY;
+    }
+};
+
+template<class Deltas> static SK_ALWAYS_INLINE
+void gen_alpha_deltas(const SkPath& path, const SkRegion& clipRgn, Deltas& result,
+        SkBlitter* blitter, bool skipRect, bool pathContainedInClip) {
+    // 1. Build edges
+    SkEdgeBuilder builder;
+    SkIRect ir               = path.getBounds().roundOut();
+    const SkIRect& clipRect  = clipRgn.getBounds();
+    int  count               = builder.build_edges(path, &clipRect, 0, pathContainedInClip,
+                                                   SkEdgeBuilder::kBezier);
+    if (count == 0) {
+        return;
+    }
+    SkBezier** list = builder.bezierList();
+
+    // 2. Try to find the rect part because blitAntiRect is so much faster than blitCoverageDeltas
+    int rectTop = ir.fBottom;   // the rect is initialized to be empty as top = bot
+    int rectBot = ir.fBottom;
+    if (skipRect) {             // only find that rect is skipRect == true
+        YLessThan lessThan;     // sort edges in YX order
+        SkTQSort(list, list + count - 1, lessThan);
+        for(int i = 0; i < count - 1; ++i) {
+            SkBezier* lb = list[i];
+            SkBezier* rb = list[i + 1];
+
+            bool lDX0 = lb->fP0.fX == lb->fP1.fX;
+            bool rDX0 = rb->fP0.fX == rb->fP1.fX;
+            if (!lDX0 || !rDX0) { // make sure that the edges are vertical
+                continue;
+            }
+
+            SkAnalyticEdge l, r;
+            l.setLine(lb->fP0, lb->fP1);
+            r.setLine(rb->fP0, rb->fP1);
+
+            SkFixed xorUpperY = l.fUpperY ^ r.fUpperY;
+            SkFixed xorLowerY = l.fLowerY ^ r.fLowerY;
+            if ((xorUpperY | xorLowerY) == 0) { // equal upperY and lowerY
+                rectTop = SkFixedCeilToInt(l.fUpperY);
+                rectBot = SkFixedFloorToInt(l.fLowerY);
+                if (rectBot > rectTop) { // if bot == top, the rect is too short for blitAntiRect
+                    int L = SkFixedCeilToInt(l.fUpperX);
+                    int R = SkFixedFloorToInt(r.fUpperX);
+                    if (L <= R) {
+                        SkAlpha la = (SkIntToFixed(L) - l.fUpperX) >> 8;
+                        SkAlpha ra = (r.fUpperX - SkIntToFixed(R)) >> 8;
+                        result.setAntiRect(L - 1, rectTop, R - L, rectBot - rectTop, la, ra);
+                    } else { // too thin to use blitAntiRect; reset the rect region to be emtpy
+                        rectTop = rectBot = ir.fBottom;
+                    }
+                }
+                break;
+            }
+
+        }
+    }
+
+    // 3. Sort edges in x so we may need less sorting for delta based on x. This only helps
+    //    SkCoverageDeltaList. And we don't want to sort more than SORT_THRESHOLD edges where
+    //    the log(count) factor of the quick sort may become a bottleneck; when there are so
+    //    many edges, we're unlikely to make deltas sorted anyway.
+    constexpr int SORT_THRESHOLD = 4096;
+    if (std::is_same<Deltas, SkCoverageDeltaList>::value && count < SORT_THRESHOLD) {
+        XLessThan lessThan;
+        SkTQSort(list, list + count - 1, lessThan);
+    }
+
+    // Future todo: parallize and SIMD the following code.
+    // 4. iterate through edges and generate deltas
+    for(int index = 0; index < count; ++index) {
+        SkAnalyticCubicEdge storage;
+        SkASSERT(sizeof(SkAnalyticQuadraticEdge) >= sizeof(SkAnalyticEdge));
+        SkASSERT(sizeof(SkAnalyticCubicEdge) >= sizeof(SkAnalyticQuadraticEdge));
+
+        SkBezier* bezier        = list[index];
+        SkAnalyticEdge* currE   = &storage;
+        bool edgeSet            = false;
+
+        switch (bezier->fCount) {
+            case 2: {
+                edgeSet = currE->setLine(bezier->fP0, bezier->fP1);
+                break;
+            }
+            case 3: {
+                SkQuad* quad = static_cast<SkQuad*>(bezier);
+                SkPoint pts[3] = {quad->fP0, quad->fP1, quad->fP2};
+                edgeSet = static_cast<SkAnalyticQuadraticEdge*>(currE)->setQuadratic(pts);
+                break;
+            }
+            case 4: {
+                SkCubic* cubic = static_cast<SkCubic*>(bezier);
+                SkPoint pts[4] = {cubic->fP0, cubic->fP1, cubic->fP2, cubic->fP3};
+                edgeSet = static_cast<SkAnalyticCubicEdge*>(currE)->setCubic(pts);
+                break;
+            }
+        }
+
+        if (!edgeSet) {
+            continue;
+        }
+
+        do {
+            currE->fX =  currE->fUpperX;
+
+            SkFixed upperFloor  = SkFixedFloorToFixed(currE->fUpperY);
+            SkFixed lowerCeil   = SkFixedCeilToFixed(currE->fLowerY);
+            int     iy          = SkFixedFloorToInt(upperFloor);
+
+            if (lowerCeil <= upperFloor + SK_Fixed1) { // only one row is affected by the currE
+                SkFixed rowHeight = currE->fLowerY - currE->fUpperY;
+                SkFixed nextX = currE->fX + SkFixedMul(currE->fDX, rowHeight);
+                add_coverage_delta_segment<true>(iy, rowHeight, currE, nextX, &result);
+                continue;
+            }
+
+            // check first row
+            SkFixed rowHeight = upperFloor + SK_Fixed1 - currE->fUpperY;
+            SkFixed nextX;
+            if (rowHeight != SK_Fixed1) {   // it's a partial row
+                nextX = currE->fX + SkFixedMul(currE->fDX, rowHeight);
+                add_coverage_delta_segment<true>(iy, rowHeight, currE, nextX, &result);
+            } else {                        // it's a full row so we can leave it to the while loop
+                iy--;                       // compensate the iy++ in the while loop
+                nextX = currE->fX;
+            }
+
+            while (true) { // process the full rows in the middle
+                iy++;
+                SkFixed y = SkIntToFixed(iy);
+                currE->fX = nextX;
+                nextX += currE->fDX;
+
+                if (y + SK_Fixed1 > currE->fLowerY) {
+                    break; // no full rows left, break
+                }
+
+                // Check whether we're in the rect part that will be covered by blitAntiRect
+                if (iy >= rectTop && iy < rectBot) {
+                    SkASSERT(currE->fDX == 0);  // If yes, we must be on an edge with fDX = 0.
+                    iy = rectBot - 1;           // Skip the rect part by advancing iy to the bottom.
+                    continue;
+                }
+
+                // Add current edge's coverage deltas on this full row
+                add_coverage_delta_segment<false>(iy, SK_Fixed1, currE, nextX, &result);
+            }
+
+            // last partial row
+            if (SkIntToFixed(iy) < currE->fLowerY) {
+                rowHeight = currE->fLowerY - SkIntToFixed(iy);
+                nextX = currE->fX + SkFixedMul(currE->fDX, rowHeight);
+                add_coverage_delta_segment<true>(iy, rowHeight, currE, nextX, &result);
+            }
+        } while (currE->update(currE->fLowerY));
+    }
+}
+
+void SkScan::DAAFillPath(const SkPath& path, const SkRegion& origClip, SkBlitter* blitter,
+                         bool forceRLE) {
+
+    FillPathFunc fillPathFunc = [](const SkPath& path, SkBlitter* blitter, bool isInverse,
+            const SkIRect& ir, const SkRegion* clipRgn, const SkIRect* clipRect, bool forceRLE){
+        bool isEvenOdd  = path.getFillType() & 1;
+        bool isConvex   = path.isConvex();
+        bool skipRect   = isConvex && !isInverse;
+
+        const SkIRect& clipBounds = clipRgn->getBounds();
+        SkIRect clippedIR = ir;
+        clippedIR.intersect(clipBounds);
+
+        SkRect rect;
+        if (!isInverse && path.isRect(&rect) && clippedIR.height() >= 3 && clippedIR.width() >= 3) {
+            // The overhead of even constructing SkCoverageDeltaList/Mask is too big. So just blit.
+            bool nonEmpty = rect.intersect(SkRect::MakeFromIRect(clipBounds));
+            SkASSERT(nonEmpty); // do_fill_path should have already handled the empty case
+            if (nonEmpty) {
+                blitter->blitFatAntiRect(rect);
+            }
+            return;
+        }
+
+        // Only blitter->blitXXX need to be done in order in the threaded backend.
+        // Everything before can be done out of order in the threaded backend.
+        if (!forceRLE && !isInverse && SkCoverageDeltaMask::Suitable(clippedIR)) {
+            SkCoverageDeltaMask deltaMask(clippedIR);
+            gen_alpha_deltas(path, *clipRgn, deltaMask, blitter, skipRect, clipRect == nullptr);
+            deltaMask.convertCoverageToAlpha(isEvenOdd, isInverse, isConvex);
+            blitter->blitMask(deltaMask.prepareSkMask(), clippedIR);
+        } else {
+            SkCoverageDeltaAllocator alloc;
+            SkCoverageDeltaList deltaList(&alloc, clippedIR.fTop, clippedIR.fBottom, forceRLE);
+            gen_alpha_deltas(path, *clipRgn, deltaList, blitter, skipRect, clipRect == nullptr);
+            blitter->blitCoverageDeltas(&deltaList, clipBounds, isEvenOdd, isInverse, isConvex);
+        }
+    };
+
+    // For threaded backend with out-of-order init-once (and therefore out-of-order do_fill_path),
+    // we probably have to take care of the blitRegion, sk_blit_above, sk_blit_below in do_fill_path
+    // to maintain the draw order. If we do that, be caureful that blitRect may throw exception is
+    // the rect is empty.
+    do_fill_path(path, origClip, blitter, forceRLE, 2, std::move(fillPathFunc));
+}
diff --git a/tools/flags/SkCommonFlags.cpp b/tools/flags/SkCommonFlags.cpp
index 0148bcd..19afd10 100644
--- a/tools/flags/SkCommonFlags.cpp
+++ b/tools/flags/SkCommonFlags.cpp
@@ -72,6 +72,11 @@
                                     "whether it's concave or convex, we consider a path complicated"
                                     "if its number of points is comparable to its resolution.");
 
+DEFINE_bool(deltaAA, false,
+            "If true, use delta anti-aliasing in suitable cases (it overrides forceAnalyticAA.");
+
+DEFINE_bool(forceDeltaAA, false, "Force delta anti-aliasing for all paths.");
+
 bool CollectImages(SkCommandLineFlags::StringArray images, SkTArray<SkString>* output) {
     SkASSERT(output);
 
diff --git a/tools/flags/SkCommonFlags.h b/tools/flags/SkCommonFlags.h
index 92ac141..d35e9a8 100644
--- a/tools/flags/SkCommonFlags.h
+++ b/tools/flags/SkCommonFlags.h
@@ -34,6 +34,8 @@
 DECLARE_bool(pre_log);
 DECLARE_bool(analyticAA);
 DECLARE_bool(forceAnalyticAA);
+DECLARE_bool(deltaAA);
+DECLARE_bool(forceDeltaAA);
 
 DECLARE_string(key);
 DECLARE_string(properties);