| /* |
| * Copyright 2024 Google LLC |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "src/core/SkBlurEngine.h" |
| |
| #include "include/core/SkAlphaType.h" |
| #include "include/core/SkBitmap.h" |
| #include "include/core/SkBlendMode.h" |
| #include "include/core/SkClipOp.h" |
| #include "include/core/SkColor.h" |
| #include "include/core/SkColorSpace.h" // IWYU pragma: keep |
| #include "include/core/SkColorType.h" |
| #include "include/core/SkImageInfo.h" |
| #include "include/core/SkM44.h" |
| #include "include/core/SkMatrix.h" |
| #include "include/core/SkPaint.h" |
| #include "include/core/SkPoint.h" |
| #include "include/core/SkRect.h" |
| #include "include/core/SkSamplingOptions.h" |
| #include "include/core/SkScalar.h" |
| #include "include/core/SkSurfaceProps.h" |
| #include "include/core/SkTileMode.h" |
| #include "include/effects/SkRuntimeEffect.h" |
| #include "include/private/base/SkAssert.h" |
| #include "include/private/base/SkFeatures.h" |
| #include "include/private/base/SkMalloc.h" |
| #include "include/private/base/SkMath.h" |
| #include "include/private/base/SkTo.h" |
| #include "src/base/SkArenaAlloc.h" |
| #include "src/base/SkVx.h" |
| #include "src/core/SkBitmapDevice.h" |
| #include "src/core/SkDevice.h" |
| #include "src/core/SkKnownRuntimeEffects.h" |
| #include "src/core/SkSpecialImage.h" |
| |
| #include <algorithm> |
| #include <array> |
| #include <cmath> |
| #include <cstdint> |
| #include <cstring> |
| #include <utility> |
| |
| |
// SK_PREFETCH(addr) is a best-effort prefetch hint for the memory at 'addr'. It expands to the SSE
// intrinsic when SSE1 is available, to __builtin_prefetch on other compilers that define __GNUC__,
// and to nothing everywhere else.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    #include <xmmintrin.h>
    #define SK_PREFETCH(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
#elif defined(__GNUC__)
    #define SK_PREFETCH(addr) __builtin_prefetch(addr)
#else
    #define SK_PREFETCH(addr)
#endif
| |
| // RasterBlurEngine |
| // ---------------------------------------------------------------------------- |
| |
| namespace { |
| |
| class Pass { |
| public: |
| explicit Pass(int border) : fBorder(border) {} |
| virtual ~Pass() = default; |
| |
| // T is type of the pixel format for the color type. |
| template <typename T> |
| void blur(int srcLeft, int srcRight, int dstRight, |
| const T* src, int srcStride, |
| T* dst, int dstStride) { |
| this->startBlur(); |
| |
| auto srcStart = srcLeft - fBorder, |
| srcEnd = srcRight - fBorder, |
| dstEnd = dstRight, |
| srcIdx = srcStart, |
| dstIdx = 0; |
| |
| const T* srcCursor = src; |
| T* dstCursor = dst; |
| |
| if (dstIdx < srcIdx) { |
| // The destination pixels are not effected by the src pixels, |
| // change to zero as per the spec. |
| // https://drafts.fxtf.org/filter-effects/#FilterPrimitivesOverviewIntro |
| int commonEnd = std::min(srcIdx, dstEnd); |
| while (dstIdx < commonEnd) { |
| *dstCursor = 0; |
| dstCursor += dstStride; |
| SK_PREFETCH(dstCursor); |
| dstIdx++; |
| } |
| } else if (srcIdx < dstIdx) { |
| // The edge of the source is before the edge of the destination. Calculate the sums for |
| // the pixels before the start of the destination. |
| if (int commonEnd = std::min(dstIdx, srcEnd); srcIdx < commonEnd) { |
| // Preload the blur with values from src before dst is entered. |
| int n = commonEnd - srcIdx; |
| this->blurSegment(n, srcCursor, srcStride, nullptr, 0); |
| srcIdx += n; |
| srcCursor += n * srcStride; |
| } |
| if (srcIdx < dstIdx) { |
| // The weird case where src is out of pixels before dst is even started. |
| int n = dstIdx - srcIdx; |
| this->blurSegment(n, nullptr, 0, nullptr, 0); |
| srcIdx += n; |
| } |
| } |
| |
| if (int commonEnd = std::min(dstEnd, srcEnd); dstIdx < commonEnd) { |
| // Both srcIdx and dstIdx are in sync now, and can run in a 1:1 fashion. This is the |
| // normal mode of operation. |
| SkASSERT(srcIdx == dstIdx); |
| |
| int n = commonEnd - dstIdx; |
| this->blurSegment(n, srcCursor, srcStride, dstCursor, dstStride); |
| srcCursor += n * srcStride; |
| dstCursor += n * dstStride; |
| dstIdx += n; |
| srcIdx += n; |
| } |
| |
| // Drain the remaining blur values into dst assuming 0's for the leading edge. |
| if (dstIdx < dstEnd) { |
| int n = dstEnd - dstIdx; |
| this->blurSegment(n, nullptr, 0, dstCursor, dstStride); |
| } |
| } |
| |
| protected: |
| virtual void startBlur() = 0; |
| virtual void blurSegment( |
| int n, const void* src, int srcStride, void* dst, int dstStride) = 0; |
| |
| private: |
| const int fBorder; |
| }; |
| |
| class PassMaker { |
| public: |
| explicit PassMaker(int window, float sigma) : fWindow{window}, |
| fSigma{sigma} {} |
| virtual ~PassMaker() = default; |
| virtual Pass* makePass(void* buffer, SkArenaAlloc* alloc) const = 0; |
| virtual size_t bufferSizeBytes() const = 0; |
| int window() const {return fWindow;} |
| float sigma() const {return fSigma;} |
| |
| private: |
| const int fWindow; |
| const float fSigma; |
| }; |
| |
| // T is type of the pixel format for the color type. |
| // This should only be used for 8bit color channels. |
| template <typename T> |
| static sk_sp<SkSpecialImage> eval_blur_passes(PassMaker* makerX, PassMaker* makerY, |
| SkBitmap src, const SkIRect& originalSrcBounds, |
| const SkIRect& originalDstBounds, |
| SkArenaAlloc* alloc) { |
| static constexpr int N = sizeof(T) / sizeof(uint8_t); |
| static_assert(N*sizeof(uint8_t) == sizeof(T), "N must be the the size of T in bytes."); |
| |
| SkIRect srcBounds = originalSrcBounds; |
| SkIRect dstBounds = originalDstBounds; |
| if (makerX->window() > 1) { |
| // Inflate the dst by the window required for the Y pass so that the X pass can prepare |
| // it. The Y pass will be offset to only write to the original rows in dstBounds, but |
| // its window will access these extra rows calculated by the X pass. The SpecialImage |
| // factory will then subset the bitmap so it appears to match 'originalDstBounds' |
| // tightly. We make one slightly larger image to hold this extra data instead of two |
| // separate images sized exactly to each pass because the CPU blur can write in place. |
| dstBounds.outset(0, SkBlurEngine::SigmaToRadius(makerY->sigma())); |
| } |
| |
| SkBitmap dst; |
| const SkIPoint dstOrigin = dstBounds.topLeft(); |
| if (!dst.tryAllocPixels(src.info().makeWH(dstBounds.width(), dstBounds.height()))) { |
| return nullptr; |
| } |
| dst.eraseColor(SK_ColorTRANSPARENT); |
| |
| auto buffer = alloc->makeBytesAlignedTo(std::max(makerX->bufferSizeBytes(), |
| makerY->bufferSizeBytes()), |
| alignof(skvx::Vec<N, uint32_t>)); |
| |
| // Basic Plan: The three cases to handle |
| // * Horizontal and Vertical - blur horizontally while copying values from the source to |
| // the destination. Then, do an in-place vertical blur. |
| // * Horizontal only - blur horizontally copying values from the source to the destination. |
| // * Vertical only - blur vertically copying values from the source to the destination. |
| |
| // Initialize these assuming the Y-only case |
| int loopStart = std::max(srcBounds.left(), dstBounds.left()); |
| int loopEnd = std::min(srcBounds.right(), dstBounds.right()); |
| int dstYOffset = 0; |
| |
| if (makerX->window() > 1) { |
| // First an X-only blur from src into dst, including the extra rows that will become |
| // input for the second Y pass, which will then be performed in place. |
| loopStart = std::max(srcBounds.top(), dstBounds.top()); |
| loopEnd = std::min(srcBounds.bottom(), dstBounds.bottom()); |
| |
| auto srcAddr = reinterpret_cast<T*>(src.getAddr(0, loopStart - srcBounds.top())); |
| auto dstAddr = reinterpret_cast<T*>(dst.getAddr(0, loopStart - dstBounds.top())); |
| |
| // Iterate over each row to calculate 1D blur along X. |
| Pass* pass = makerX->makePass(buffer, alloc); |
| for (int y = loopStart; y < loopEnd; ++y) { |
| pass->blur<T>(srcBounds.left() - dstBounds.left(), |
| srcBounds.right() - dstBounds.left(), |
| dstBounds.width(), |
| srcAddr, 1, |
| dstAddr, 1); |
| srcAddr += src.rowBytesAsPixels(); |
| dstAddr += dst.rowBytesAsPixels(); |
| } |
| |
| // Set up the Y pass to blur from the full dst into the non-outset portion of dst |
| src = dst; |
| loopStart = originalDstBounds.left(); |
| loopEnd = originalDstBounds.right(); |
| // The new 'dst' is equal to dst.extractSubset(originalDstBounds.offset(-dstOrigin)), |
| // but by construction only the Y offset has an interesting value so this is a little |
| // more efficient. |
| dstYOffset = originalDstBounds.top() - dstBounds.top(); |
| |
| srcBounds = dstBounds; |
| dstBounds = originalDstBounds; |
| } |
| |
| // Iterate over each column to calculate 1D blur along Y. This is either blurring from src |
| // into dst for a 1D blur; or it's blurring from dst into dst for the second pass of a 2D |
| // blur. |
| if (makerY->window() > 1) { |
| auto srcAddr = reinterpret_cast<T*>(src.getAddr(loopStart - srcBounds.left(), 0)); |
| auto dstAddr = reinterpret_cast<T*>(dst.getAddr(loopStart - dstBounds.left(), dstYOffset)); |
| |
| Pass* pass = makerY->makePass(buffer, alloc); |
| for (int x = loopStart; x < loopEnd; ++x) { |
| pass->blur<T>(srcBounds.top() - dstBounds.top(), |
| srcBounds.bottom() - dstBounds.top(), |
| dstBounds.height(), |
| srcAddr, src.rowBytesAsPixels(), |
| dstAddr, dst.rowBytesAsPixels()); |
| srcAddr += 1; |
| dstAddr += 1; |
| } |
| } |
| |
| #if defined(SK_AVOID_SLOW_RASTER_PIPELINE_BLURS) |
| // When avoiding the shader-based algorithm, handle the box identity case. |
| if (makerX->window() == 1 && makerY->window() == 1) { |
| dst.writePixels(src.pixmap(), |
| srcBounds.left() - dstBounds.left(), |
| srcBounds.top() - dstBounds.top()); |
| } |
| #endif //SK_AVOID_SLOW_RASTER_PIPELINE_BLURS |
| |
| dstBounds = originalDstBounds.makeOffset(-dstOrigin); // Make relative to dst's pixels |
| return SkSpecialImages::MakeFromRaster(dstBounds, dst, SkSurfaceProps{}); |
| } |
| |
| // Implement a scanline processor for a true 1D Gaussian kernel. |
| // T is type of the pixel format for the color type. |
| // This should only be used for 8bit color channels. |
| template <typename T> |
| class GaussianPass final : public Pass { |
| public: |
| static constexpr int N = sizeof(T) / sizeof(uint8_t); |
| static_assert(N*sizeof(uint8_t) == sizeof(T), "N must be the the size of T in bytes."); |
| |
| static constexpr float kMaxSigma = 2.f; |
| |
| static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) { |
| if (sigma >= kMaxSigma) { return nullptr; } |
| |
| class Maker : public PassMaker { |
| public: |
| explicit Maker(float sigma) |
| : PassMaker{2 * SkBlurEngine::SigmaToRadius(sigma) + 1, sigma} {} |
| Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override { |
| return GaussianPass::Make(this->sigma(), buffer, alloc); |
| } |
| size_t bufferSizeBytes() const override { |
| // Data is skvx::Vec<N, float>[window] + float[window] |
| return this->window() * (sizeof(skvx::Vec<N, float>) + sizeof(float)); |
| |
| } |
| }; |
| |
| return alloc->make<Maker>(sigma); |
| } |
| |
| static GaussianPass* Make(float sigma, void* buffers, SkArenaAlloc* alloc) { |
| int radius = SkBlurEngine::SigmaToRadius(sigma); |
| int kernelWidth = 2*radius + 1; |
| |
| skvx::Vec<N, float>* srcBuffer = static_cast<skvx::Vec<N, float>*>(buffers); |
| |
| float* kernelValues = reinterpret_cast<float*>(srcBuffer + kernelWidth); |
| SkShaderBlurAlgorithm::Compute1DBlurKernel(sigma, radius, {kernelValues, kernelWidth}); |
| |
| return alloc->make<GaussianPass>(radius, kernelValues, srcBuffer); |
| } |
| |
| |
| GaussianPass(int radius, float* kernel, skvx::Vec<N, float>* srcBuffer) |
| : Pass(radius), |
| fWindow(2 * radius + 1), |
| fKernel(kernel), |
| fSrcBuffer(srcBuffer), |
| fSrcBufferBase(0) {} |
| |
| private: |
| void startBlur() override { |
| // Zero out the source buffer to ensure a clean state. |
| sk_bzero(fSrcBuffer, fWindow * sizeof(skvx::Vec<N, float>)); |
| // Reset the circular buffer's starting position. |
| fSrcBufferBase = 0; |
| } |
| |
| void blurSegment(int n, const void* src, int srcStride, void* dst, int dstStride) override { |
| const T* srcPtr = reinterpret_cast<const T*>(src); |
| T* dstPtr = reinterpret_cast<T*>(dst); |
| |
| // Load the state from the last run. |
| int base = fSrcBufferBase; |
| |
| auto convolve = [this](int srcBase) { |
| skvx::Vec<N, float> sum = 0.f; |
| for (int i = 0; i < fWindow; ++i) { |
| int s = (i + srcBase) % fWindow; |
| sum += fSrcBuffer[s] * fKernel[i]; |
| } |
| return skvx::cast<uint8_t>(skvx::pin(sum * 255.f + 0.5f, |
| skvx::Vec<N, float>(0.f), |
| skvx::Vec<N, float>(255.f))); |
| }; |
| |
| while (n-- > 0) { |
| skvx::Vec<N, float> leadingEdge = srcPtr |
| ? skvx::cast<float>(skvx::Vec<N, uint8_t>::Load(srcPtr)) * (1 / 255.0f) |
| : skvx::Vec<N, float>(0.f); |
| |
| // Load the new leading edge into the circular buffer. |
| fSrcBuffer[(base + fWindow - 1) % fWindow] = leadingEdge; |
| |
| // Perform the convolution and store the result. |
| if (dstPtr) { |
| convolve(base).store(dstPtr); |
| dstPtr += dstStride; |
| } |
| |
| // Advance the source pointer (if it exists) and the circular buffer base. |
| if (srcPtr) { |
| srcPtr += srcStride; |
| } |
| base = (base + 1) % fWindow; |
| } |
| |
| fSrcBufferBase = base; |
| } |
| |
| const int fWindow; |
| float* fKernel; |
| skvx::Vec<N, float>* fSrcBuffer; |
| int fSrcBufferBase; |
| }; |
| |
| |
| // Implement a scanline processor that uses a three-box filter to approximate a Gaussian blur. |
| // The ThreeBoxApproxPass is limit to processing sigmas < 135. |
| class ThreeBoxApproxPass final : public Pass { |
| public: |
| // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow |
| // using the Gauss filter. It also limits the size of buffers used hold intermediate values. |
| // Explanation of maximums: |
| // sum0 = window * 255 |
| // sum1 = window * sum0 -> window * window * 255 |
| // sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255 |
| // |
| // The value window^3 * 255 must fit in a uint32_t. So, |
| // window^3 < 2^32. window = 255. |
| // |
| // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5) |
| // For window <= 255, the largest value for sigma is 136. |
| static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) { |
| SkASSERT(0 <= sigma); |
| int window = SkBlurEngine::BoxBlurWindow(sigma); |
| if (255 <= window) { |
| return nullptr; |
| } |
| |
| class Maker : public PassMaker { |
| public: |
| explicit Maker(int window, float sigma) : PassMaker{window, sigma} {} |
| Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override { |
| return ThreeBoxApproxPass::Make(this->window(), buffer, alloc); |
| } |
| |
| size_t bufferSizeBytes() const override { |
| int window = this->window(); |
| size_t onePassSize = window - 1; |
| // If the window is odd, then there is an obvious middle element. For even sizes |
| // 2 passes are shifted, and the last pass has an extra element. Like this: |
| // S |
| // aaaAaa |
| // bbBbbb |
| // cccCccc |
| // D |
| size_t bufferCount = (window & 1) == 1 ? 3 * onePassSize : 3 * onePassSize + 1; |
| return bufferCount * sizeof(skvx::Vec<4, uint32_t>); |
| } |
| }; |
| |
| return alloc->make<Maker>(window, sigma); |
| } |
| |
| static ThreeBoxApproxPass* Make(int window, void* buffers, SkArenaAlloc* alloc) { |
| // We don't need to store the trailing edge pixel in the buffer; |
| int passSize = window - 1; |
| skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers); |
| skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize; |
| skvx::Vec<4, uint32_t>* buffer2 = buffer1 + passSize; |
| // If the window is odd just one buffer is needed, but if it's even, then there is one |
| // more element on that pass. |
| skvx::Vec<4, uint32_t>* buffersEnd = buffer2 + ((window & 1) ? passSize : passSize + 1); |
| |
| // Calculating the border is tricky. The border is the distance in pixels between the first |
| // dst pixel and the first src pixel (or the last src pixel and the last dst pixel). |
| // I will go through the odd case which is simpler, and then through the even case. Given a |
| // stack of filters seven wide for the odd case of three passes. |
| // |
| // S |
| // aaaAaaa |
| // bbbBbbb |
| // cccCccc |
| // D |
| // |
| // The furthest changed pixel is when the filters are in the following configuration. |
| // |
| // S |
| // aaaAaaa |
| // bbbBbbb |
| // cccCccc |
| // D |
| // |
| // The A pixel is calculated using the value S, the B uses A, and the C uses B, and |
| // finally D is C. So, with a window size of seven the border is nine. In the odd case, the |
| // border is 3*((window - 1)/2). |
| // |
| // For even cases the filter stack is more complicated. The spec specifies two passes |
| // of even filters and a final pass of odd filters. A stack for a width of six looks like |
| // this. |
| // |
| // S |
| // aaaAaa |
| // bbBbbb |
| // cccCccc |
| // D |
| // |
| // The furthest pixel looks like this. |
| // |
| // S |
| // aaaAaa |
| // bbBbbb |
| // cccCccc |
| // D |
| // |
| // For a window of six, the border value is eight. In the even case the border is 3 * |
| // (window/2) - 1. |
| int border = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1; |
| |
| // If the window is odd then the divisor is just window ^ 3 otherwise, |
| // it is window * window * (window + 1) = window ^ 3 + window ^ 2; |
| int window2 = window * window; |
| int window3 = window2 * window; |
| int divisor = (window & 1) == 1 ? window3 : window3 + window2; |
| return alloc->make<ThreeBoxApproxPass>(buffer0, buffer1, buffer2, |
| buffersEnd, border, divisor); |
| } |
| |
| ThreeBoxApproxPass(skvx::Vec<4, uint32_t>* buffer0, |
| skvx::Vec<4, uint32_t>* buffer1, |
| skvx::Vec<4, uint32_t>* buffer2, |
| skvx::Vec<4, uint32_t>* buffersEnd, |
| int border, |
| int divisor) |
| : Pass{border} |
| , fBuffer0{buffer0} |
| , fBuffer1{buffer1} |
| , fBuffer2{buffer2} |
| , fBuffersEnd{buffersEnd} |
| , fDivider(divisor) {} |
| |
| private: |
| void startBlur() override { |
| skvx::Vec<4, uint32_t> zero = {0u, 0u, 0u, 0u}; |
| zero.store(fSum0); |
| zero.store(fSum1); |
| auto half = fDivider.half(); |
| skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum2); |
| sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>)); |
| |
| fBuffer0Cursor = fBuffer0; |
| fBuffer1Cursor = fBuffer1; |
| fBuffer2Cursor = fBuffer2; |
| } |
| |
| // GaussPass implements the common three pass box filter approximation of Gaussian blur, |
| // but combines all three passes into a single pass. This approach is facilitated by three |
| // circular buffers the width of the window which track values for trailing edges of each of |
| // the three passes. This allows the algorithm to use more precision in the calculation |
| // because the values are not rounded each pass. And this implementation also avoids a trap |
| // that's easy to fall into resulting in blending in too many zeroes near the edge. |
| // |
| // In general, a window sum has the form: |
| // sum_n+1 = sum_n + leading_edge - trailing_edge. |
| // If instead we do the subtraction at the end of the previous iteration, we can just |
| // calculate the sums instead of having to do the subtractions too. |
| // |
| // In previous iteration: |
| // sum_n+1 = sum_n - trailing_edge. |
| // |
| // In this iteration: |
| // sum_n+1 = sum_n + leading_edge. |
| // |
| // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the |
| // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the |
| // three passes at the same time has the form: |
| // |
| // sum0_n+1 = sum0_n + leading edge |
| // sum1_n+1 = sum1_n + sum0_n+1 |
| // sum2_n+1 = sum2_n + sum1_n+1 |
| // |
| // sum2_n+1 / window^3 is the new value of the destination pixel. |
| // |
| // Reduce the sums by the trailing edges which were stored in the circular buffers for the |
| // next go around. This is the case for odd sized windows, even windows the the third |
| // circular buffer is one larger then the first two circular buffers. |
| // |
| // sum2_n+2 = sum2_n+1 - buffer2[i]; |
| // buffer2[i] = sum1; |
| // sum1_n+2 = sum1_n+1 - buffer1[i]; |
| // buffer1[i] = sum0; |
| // sum0_n+2 = sum0_n+1 - buffer0[i]; |
| // buffer0[i] = leading edge |
| void blurSegment( |
| int n, const void* src, int srcStride, void* dst, int dstStride) override { |
| const uint32_t* src32 = reinterpret_cast<const uint32_t*>(src); |
| uint32_t* dst32 = reinterpret_cast<uint32_t*>(dst); |
| #if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX |
| skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor; |
| skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor; |
| skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor; |
| v4u32 sum0 = __lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0); |
| v4u32 sum1 = __lsx_vld(fSum1, 0); |
| v4u32 sum2 = __lsx_vld(fSum2, 0); |
| |
| auto processValue = [&](v4u32& vLeadingEdge){ |
| sum0 += vLeadingEdge; |
| sum1 += sum0; |
| sum2 += sum1; |
| |
| v4u32 divisorFactor = __lsx_vreplgr2vr_w(fDivider.divisorFactor()); |
| v4u32 blurred = __lsx_vmuh_w(divisorFactor, sum2); |
| |
| v4u32 buffer2Value = __lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds. |
| sum2 -= buffer2Value; |
| __lsx_vst(sum1, (void *)buffer2Cursor, 0); |
| buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2; |
| v4u32 buffer1Value = __lsx_vld(buffer1Cursor, 0); |
| sum1 -= buffer1Value; |
| __lsx_vst(sum0, (void *)buffer1Cursor, 0); |
| buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1; |
| v4u32 buffer0Value = __lsx_vld(buffer0Cursor, 0); |
| sum0 -= buffer0Value; |
| __lsx_vst(vLeadingEdge, (void *)buffer0Cursor, 0); |
| buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0; |
| |
| v16u8 shuf = {0x0,0x4,0x8,0xc,0x0}; |
| v16u8 ret = __lsx_vshuf_b(blurred, blurred, shuf); |
| return ret; |
| }; |
| |
| v4u32 zero = __lsx_vldi(0x0); |
| if (!src32 && !dst32) { |
| while (n --> 0) { |
| (void)processValue(zero); |
| } |
| } else if (src32 && !dst32) { |
| while (n --> 0) { |
| v4u32 edge = __lsx_vinsgr2vr_w(zero, *src32, 0); |
| edge = __lsx_vilvl_b(zero, edge); |
| edge = __lsx_vilvl_h(zero, edge); |
| (void)processValue(edge); |
| src32 += srcStride; |
| } |
| } else if (!src32 && dst32) { |
| while (n --> 0) { |
| v4u32 ret = processValue(zero); |
| __lsx_vstelm_w(ret, dst32, 0, 0); // 3rd is offset, 4th is idx. |
| dst32 += dstStride; |
| } |
| } else if (src32 && dst32) { |
| while (n --> 0) { |
| v4u32 edge = __lsx_vinsgr2vr_w(zero, *src32, 0); |
| edge = __lsx_vilvl_b(zero, edge); |
| edge = __lsx_vilvl_h(zero, edge); |
| v4u32 ret = processValue(edge); |
| __lsx_vstelm_w(ret, dst32, 0, 0); |
| src32 += srcStride; |
| dst32 += dstStride; |
| } |
| } |
| |
| // Store the state |
| fBuffer0Cursor = buffer0Cursor; |
| fBuffer1Cursor = buffer1Cursor; |
| fBuffer2Cursor = buffer2Cursor; |
| |
| __lsx_vst(sum0, fSum0, 0); |
| __lsx_vst(sum1, fSum1, 0); |
| __lsx_vst(sum2, fSum2, 0); |
| #else |
| skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor; |
| skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor; |
| skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor; |
| skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0); |
| skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1); |
| skvx::Vec<4, uint32_t> sum2 = skvx::Vec<4, uint32_t>::Load(fSum2); |
| |
| // Given an expanded input pixel, move the window ahead using the leadingEdge value. |
| auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) { |
| sum0 += leadingEdge; |
| sum1 += sum0; |
| sum2 += sum1; |
| |
| skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum2); |
| |
| sum2 -= *buffer2Cursor; |
| *buffer2Cursor = sum1; |
| buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2; |
| sum1 -= *buffer1Cursor; |
| *buffer1Cursor = sum0; |
| buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1; |
| sum0 -= *buffer0Cursor; |
| *buffer0Cursor = leadingEdge; |
| buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0; |
| |
| return skvx::cast<uint8_t>(blurred); |
| }; |
| |
| auto loadEdge = [&](const uint32_t* srcCursor) { |
| return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor)); |
| }; |
| |
| if (!src32 && !dst32) { |
| while (n --> 0) { |
| (void)processValue(0); |
| } |
| } else if (src32 && !dst32) { |
| while (n --> 0) { |
| (void)processValue(loadEdge(src32)); |
| src32 += srcStride; |
| } |
| } else if (!src32 && dst32) { |
| while (n --> 0) { |
| processValue(0u).store(dst32); |
| dst32 += dstStride; |
| } |
| } else if (src32 && dst32) { |
| while (n --> 0) { |
| processValue(loadEdge(src32)).store(dst32); |
| src32 += srcStride; |
| dst32 += dstStride; |
| } |
| } |
| |
| // Store the state |
| fBuffer0Cursor = buffer0Cursor; |
| fBuffer1Cursor = buffer1Cursor; |
| fBuffer2Cursor = buffer2Cursor; |
| |
| sum0.store(fSum0); |
| sum1.store(fSum1); |
| sum2.store(fSum2); |
| #endif |
| } |
| |
| skvx::Vec<4, uint32_t>* const fBuffer0; |
| skvx::Vec<4, uint32_t>* const fBuffer1; |
| skvx::Vec<4, uint32_t>* const fBuffer2; |
| skvx::Vec<4, uint32_t>* const fBuffersEnd; |
| const skvx::ScaledDividerU32 fDivider; |
| |
| // blur state |
| char fSum0[sizeof(skvx::Vec<4, uint32_t>)]; |
| char fSum1[sizeof(skvx::Vec<4, uint32_t>)]; |
| char fSum2[sizeof(skvx::Vec<4, uint32_t>)]; |
| skvx::Vec<4, uint32_t>* fBuffer0Cursor; |
| skvx::Vec<4, uint32_t>* fBuffer1Cursor; |
| skvx::Vec<4, uint32_t>* fBuffer2Cursor; |
| }; |
| |
| // Implement a scanline processor that uses a two-box filter to approximate a Tent filter. |
| // The TentPass is limit to processing sigmas < 2183. |
| class TentPass final : public Pass { |
| public: |
| // NB 2183 is the largest sigma that will not cause a buffer full of 255 mask values to overflow |
| // using the Tent filter. It also limits the size of buffers used hold intermediate values. |
| // Explanation of maximums: |
| // sum0 = window * 255 |
| // sum1 = window * sum0 -> window * window * 255 |
| // |
| // The value window^2 * 255 must fit in a uint32_t. So, |
| // window^2 < 2^32. window = 4104. |
| // |
| // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5) |
| // For window <= 4104, the largest value for sigma is 2183. |
| static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) { |
| SkASSERT(0 <= sigma); |
| int gaussianWindow = SkBlurEngine::BoxBlurWindow(sigma); |
| // This is a naive method of using the window size for the Gaussian blur to calculate the |
| // window size for the Tent blur. This seems to work well in practice. |
| // |
| // We can use a single pixel to generate the effective blur area given a window size. For |
| // the Gaussian blur this is 3 * window size. For the Tent filter this is 2 * window size. |
| int tentWindow = 3 * gaussianWindow / 2; |
| if (tentWindow >= 4104) { |
| return nullptr; |
| } |
| |
| class Maker : public PassMaker { |
| public: |
| explicit Maker(int window, float sigma) : PassMaker{window, sigma} {} |
| Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override { |
| return TentPass::Make(this->window(), buffer, alloc); |
| } |
| |
| size_t bufferSizeBytes() const override { |
| size_t onePassSize = this->window() - 1; |
| // If the window is odd, then there is an obvious middle element. For even sizes 2 |
| // passes are shifted, and the last pass has an extra element. Like this: |
| // S |
| // aaaAaa |
| // bbBbbb |
| // D |
| size_t bufferCount = 2 * onePassSize; |
| return bufferCount * sizeof(skvx::Vec<4, uint32_t>); |
| } |
| }; |
| |
| return alloc->make<Maker>(tentWindow, sigma); |
| } |
| |
| static TentPass* Make(int window, void* buffers, SkArenaAlloc* alloc) { |
| if (window > 4104) { |
| return nullptr; |
| } |
| |
| // We don't need to store the trailing edge pixel in the buffer; |
| int passSize = window - 1; |
| skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers); |
| skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize; |
| skvx::Vec<4, uint32_t>* buffersEnd = buffer1 + passSize; |
| |
| // Calculating the border is tricky. The border is the distance in pixels between the first |
| // dst pixel and the first src pixel (or the last src pixel and the last dst pixel). |
| // I will go through the odd case which is simpler, and then through the even case. Given a |
| // stack of filters seven wide for the odd case of three passes. |
| // |
| // S |
| // aaaAaaa |
| // bbbBbbb |
| // D |
| // |
| // The furthest changed pixel is when the filters are in the following configuration. |
| // |
| // S |
| // aaaAaaa |
| // bbbBbbb |
| // D |
| // |
| // The A pixel is calculated using the value S, the B uses A, and the D uses B. |
| // So, with a window size of seven the border is nine. In the odd case, the border is |
| // window - 1. |
| // |
| // For even cases the filter stack is more complicated. It uses two passes |
| // of even filters offset from each other. A stack for a width of six looks like |
| // this. |
| // |
| // S |
| // aaaAaa |
| // bbBbbb |
| // D |
| // |
| // The furthest pixel looks like this. |
| // |
| // S |
| // aaaAaa |
| // bbBbbb |
| // D |
| // |
| // For a window of six, the border value is 5. In the even case the border is |
| // window - 1. |
| int border = window - 1; |
| |
| int divisor = window * window; |
| return alloc->make<TentPass>(buffer0, buffer1, buffersEnd, border, divisor); |
| } |
| |
| TentPass(skvx::Vec<4, uint32_t>* buffer0, |
| skvx::Vec<4, uint32_t>* buffer1, |
| skvx::Vec<4, uint32_t>* buffersEnd, |
| int border, |
| int divisor) |
| : Pass{border} |
| , fBuffer0{buffer0} |
| , fBuffer1{buffer1} |
| , fBuffersEnd{buffersEnd} |
| , fDivider(divisor) {} |
| |
| private: |
| void startBlur() override { |
| skvx::Vec<4, uint32_t>{0u, 0u, 0u, 0u}.store(fSum0); |
| auto half = fDivider.half(); |
| skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum1); |
| sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>)); |
| |
| fBuffer0Cursor = fBuffer0; |
| fBuffer1Cursor = fBuffer1; |
| } |
| |
| // TentPass implements the common two pass box filter approximation of Tent filter, |
| // but combines all both passes into a single pass. This approach is facilitated by two |
| // circular buffers the width of the window which track values for trailing edges of each of |
| // both passes. This allows the algorithm to use more precision in the calculation |
| // because the values are not rounded each pass. And this implementation also avoids a trap |
| // that's easy to fall into resulting in blending in too many zeroes near the edge. |
| // |
| // In general, a window sum has the form: |
| // sum_n+1 = sum_n + leading_edge - trailing_edge. |
| // If instead we do the subtraction at the end of the previous iteration, we can just |
| // calculate the sums instead of having to do the subtractions too. |
| // |
| // In previous iteration: |
| // sum_n+1 = sum_n - trailing_edge. |
| // |
| // In this iteration: |
| // sum_n+1 = sum_n + leading_edge. |
| // |
| // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the |
| // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the |
| // three passes at the same time has the form: |
| // |
| // sum0_n+1 = sum0_n + leading edge |
| // sum1_n+1 = sum1_n + sum0_n+1 |
| // |
| // sum1_n+1 / window^2 is the new value of the destination pixel. |
| // |
| // Reduce the sums by the trailing edges which were stored in the circular buffers for the |
| // next go around. |
| // |
| // sum1_n+2 = sum1_n+1 - buffer1[i]; |
| // buffer1[i] = sum0; |
| // sum0_n+2 = sum0_n+1 - buffer0[i]; |
| // buffer0[i] = leading edge |
| void blurSegment(
|         int n, const void* src, int srcStride, void* dst, int dstStride) override {
|     using Vec4 = skvx::Vec<4, uint32_t>;
|
|     // Either pointer may be null: a null 'src' feeds zeros into the window, and a null 'dst'
|     // advances the window without writing any output.
|     const uint32_t* in = reinterpret_cast<const uint32_t*>(src);
|     uint32_t* out = reinterpret_cast<uint32_t*>(dst);
|
|     // Pull the persistent blur state into locals for the duration of this segment.
|     Vec4* trail0 = fBuffer0Cursor;
|     Vec4* trail1 = fBuffer1Cursor;
|     Vec4 accum0 = Vec4::Load(fSum0);
|     Vec4 accum1 = Vec4::Load(fSum1);
|
|     // Slides the window forward by one pixel whose expanded value is 'incoming' and returns
|     // the 8-bit-per-channel result for the new window position.
|     auto step = [&](const Vec4& incoming) {
|         accum0 += incoming;
|         accum1 += accum0;
|
|         const Vec4 scaled = fDivider.divide(accum1);
|
|         // Retire the trailing edge of the second sum, then record the value that will be
|         // retired once this ring-buffer slot comes around again.
|         accum1 -= *trail1;
|         *trail1 = accum0;
|         if (++trail1 >= fBuffersEnd) {
|             trail1 = fBuffer1;
|         }
|
|         // Same bookkeeping for the first sum; its ring buffer ends where the second begins.
|         accum0 -= *trail0;
|         *trail0 = incoming;
|         if (++trail0 >= fBuffer1) {
|             trail0 = fBuffer0;
|         }
|
|         return skvx::cast<uint8_t>(scaled);
|     };
|
|     // Widens the four 8-bit channels found at 'pixel' into 32-bit lanes.
|     auto expand = [&](const uint32_t* pixel) {
|         return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(pixel));
|     };
|
|     // One specialized loop per combination of present/absent src and dst.
|     if (in && out) {
|         for (int i = 0; i < n; ++i) {
|             step(expand(in)).store(out);
|             in += srcStride;
|             out += dstStride;
|         }
|     } else if (in) {
|         for (int i = 0; i < n; ++i) {
|             (void)step(expand(in));
|             in += srcStride;
|         }
|     } else if (out) {
|         for (int i = 0; i < n; ++i) {
|             step(0u).store(out);
|             out += dstStride;
|         }
|     } else {
|         for (int i = 0; i < n; ++i) {
|             (void)step(0u);
|         }
|     }
|
|     // Write the state back so the next segment continues where this one stopped.
|     fBuffer0Cursor = trail0;
|     fBuffer1Cursor = trail1;
|     accum0.store(fSum0);
|     accum1.store(fSum1);
| }
| |
| // Ring-buffer storage as used by startBlur() and blurSegment(): the first pass's trailing
| // edges live in [fBuffer0, fBuffer1) and the second pass's in [fBuffer1, fBuffersEnd).
| skvx::Vec<4, uint32_t>* const fBuffer0;
| skvx::Vec<4, uint32_t>* const fBuffer1;
| skvx::Vec<4, uint32_t>* const fBuffersEnd;
| // Turns the accumulated second sum into the output value in blurSegment(); its half() value
| // is what startBlur() seeds fSum1 with.
| const skvx::ScaledDividerU32 fDivider;
|
| // blur state
| // Running sums of the two passes, held as raw bytes and accessed through
| // skvx::Vec<4, uint32_t>::Load() / store().
| char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
| char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
| // Next slot to retire and overwrite in each ring buffer; reset to the buffer starts by
| // startBlur() and advanced (with wrap-around) by blurSegment().
| skvx::Vec<4, uint32_t>* fBuffer0Cursor;
| skvx::Vec<4, uint32_t>* fBuffer1Cursor;
| }; |
| |
| // Single-channel (A8) blur pass that evaluates three stacked running sums in one sweep, using
| // one ring buffer per sum to remember the values that must later be subtracted back out.
| class A8Pass final : public Pass {
| public:
|     // Returns a PassMaker, allocated from 'alloc', for the given sigma. The window is
|     // sigma * 3 * sqrt(2 * pi) / 4 rounded to the nearest integer, but never less than 1.
|     static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) {
|         SkASSERT(0 <= sigma);
|         const int roundedWindow =
|                 static_cast<int>(floor(sigma * 3 * sqrt(2 * SK_DoublePI) / 4 + 0.5));
|         const int window = std::max(1, roundedWindow);
|
|         class Maker : public PassMaker {
|         public:
|             explicit Maker(int window, float sigma) : PassMaker{window, sigma} {}
|
|             Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
|                 return A8Pass::Make(this->window(), buffer, alloc);
|             }
|
|             // Bytes of scratch storage that A8Pass::Make() splits into its three ring buffers:
|             // two of (window - 1) entries and a third that gains one entry for even windows.
|             size_t bufferSizeBytes() const override {
|                 const int w = this->window();
|                 const size_t ringSize = w - 1;
|                 const size_t lastRingSize = (w & 1) == 1 ? ringSize : ringSize + 1;
|                 return (ringSize + ringSize + lastRingSize) * sizeof(uint32_t);
|             }
|         };
|
|         return alloc->make<Maker>(window, sigma);
|     }
|
|     // Creates an A8Pass in 'alloc' whose three ring buffers are laid out back to back inside
|     // 'buffers' (sized as reported by the maker's bufferSizeBytes()).
|     static A8Pass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
|         const bool oddWindow = (window & 1) == 1;
|         const size_t ringSize = window - 1;
|         const size_t lastRingSize = oddWindow ? ringSize : ringSize + 1;
|
|         // Each ring ends exactly where the next one begins.
|         uint32_t* const ring0 = static_cast<uint32_t*>(buffers);
|         uint32_t* const ring1 = ring0 + ringSize;
|         uint32_t* const ring2 = ring1 + ringSize;
|         uint32_t* const ringsEnd = ring2 + lastRingSize;
|
|         // The border is the distance in pixels between the first dst pixel and the first src
|         // pixel (equivalently, between the last src pixel and the last dst pixel).
|         //
|         // Odd windows are the simple case. Three stacked filters of width seven line up as:
|         //
|         //             S
|         //          aaaAaaa
|         //          bbbBbbb
|         //          cccCccc
|         //             D
|         //
|         // and the furthest pixel that S can influence is reached in this configuration:
|         //
|         //                   S
|         //             aaaAaaa
|         //          bbbBbbb
|         //       cccCccc
|         //          D
|         //
|         // A is computed from S, B from A, C from B, and D is C. For a window of seven the
|         // border is therefore nine; in general, for odd windows, it is 3*((window - 1)/2).
|         //
|         // Even windows are more involved: the spec calls for two passes of even filters
|         // followed by a final pass with an odd filter. For a width of six the stack is:
|         //
|         //             S
|         //          aaaAaa
|         //           bbBbbb
|         //          cccCccc
|         //             D
|         //
|         // and the furthest reachable pixel is:
|         //
|         //                  S
|         //             aaaAaa
|         //           bbBbbb
|         //       cccCccc
|         //          D
|         //
|         // For a window of six the border is eight; in general, for even windows, it is
|         // 3*(window/2) - 1.
|         int border;
|         if (oddWindow) {
|             border = 3 * ((window - 1) / 2);
|         } else {
|             border = 3 * (window / 2) - 1;
|         }
|
|         // The sum is normalized by window^3 for odd windows. For even windows the last filter
|         // is one wider, giving window * window * (window + 1) = window^3 + window^2.
|         const int squared = window * window;
|         const int cubed = squared * window;
|         const int divisor = oddWindow ? cubed : cubed + squared;
|
|         // Reciprocal of the divisor, scaled by 2^32 (see finalScale()).
|         const uint64_t weight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
|
|         return alloc->make<A8Pass>(weight, ring0, ring1, ring1, ring2, ring2, ringsEnd, border);
|     }
|
|     A8Pass(uint64_t weight,
|            uint32_t* buffer0, uint32_t* buffer0End,
|            uint32_t* buffer1, uint32_t* buffer1End,
|            uint32_t* buffer2, uint32_t* buffer2End,
|            int border)
|             : Pass{border}
|             , fWeight(weight)
|             , fBuffer0{buffer0}
|             , fBuffer0End{buffer0End}
|             , fBuffer1{buffer1}
|             , fBuffer1End{buffer1End}
|             , fBuffer2{buffer2}
|             , fBuffer2End{buffer2End} {}
|
| private:
|     // Resets the running sums, clears all three ring buffers and rewinds their cursors.
|     void startBlur() override {
|         fSum0 = 0;
|         fSum1 = 0;
|         fSum2 = 0;
|
|         // The rings are contiguous, so a single clear from the first to the last covers all.
|         sk_bzero(fBuffer0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
|
|         fBuffer0Cursor = fBuffer0;
|         fBuffer1Cursor = fBuffer1;
|         fBuffer2Cursor = fBuffer2;
|     }
|
|     // Advances the window by 'n' pixels. A null 'src' feeds zeros into the window; a null 'dst'
|     // advances the window without writing results.
|     void blurSegment(
|             int n, const void* src, int srcStride, void* dst, int dstStride) override {
|         const uint8_t* in = reinterpret_cast<const uint8_t*>(src);
|         uint8_t* out = reinterpret_cast<uint8_t*>(dst);
|
|         // Nothing to do for an empty (or negative) segment.
|         if (n <= 0) {
|             return;
|         }
|
|         // Work on local copies of the persistent state.
|         uint32_t* trail0 = fBuffer0Cursor;
|         uint32_t* trail1 = fBuffer1Cursor;
|         uint32_t* trail2 = fBuffer2Cursor;
|         uint32_t accum0 = fSum0;
|         uint32_t accum1 = fSum1;
|         uint32_t accum2 = fSum2;
|
|         // Slides the window forward by one pixel of value 'incoming' and returns the blurred
|         // value for the new window position.
|         auto step = [&](const uint32_t incoming) {
|             accum0 += incoming;
|             accum1 += accum0;
|             accum2 += accum1;
|
|             const uint8_t result = this->finalScale(accum2);
|
|             // Retire each sum's trailing edge and record the value to retire on the next lap.
|             accum2 -= *trail2;
|             *trail2 = accum1;
|             if (++trail2 >= fBuffer2End) {
|                 trail2 = fBuffer2;
|             }
|
|             accum1 -= *trail1;
|             *trail1 = accum0;
|             if (++trail1 >= fBuffer1End) {
|                 trail1 = fBuffer1;
|             }
|
|             accum0 -= *trail0;
|             *trail0 = incoming;
|             if (++trail0 >= fBuffer0End) {
|                 trail0 = fBuffer0;
|             }
|
|             return result;
|         };
|
|         // One specialized loop per combination of present/absent src and dst.
|         if (in && out) {
|             for (int i = 0; i < n; ++i) {
|                 *out = step(*in);
|                 in += srcStride;
|                 out += dstStride;
|             }
|         } else if (in) {
|             for (int i = 0; i < n; ++i) {
|                 (void)step(*in);
|                 in += srcStride;
|             }
|         } else if (out) {
|             for (int i = 0; i < n; ++i) {
|                 *out = step(0);
|                 out += dstStride;
|             }
|         } else {
|             for (int i = 0; i < n; ++i) {
|                 (void)step(0);
|             }
|         }
|
|         // Persist the state so the next call resumes from here.
|         fBuffer0Cursor = trail0;
|         fBuffer1Cursor = trail1;
|         fBuffer2Cursor = trail2;
|         fSum0 = accum0;
|         fSum1 = accum1;
|         fSum2 = accum2;
|     }
|
|     // Rounding term for the 32-bit fixed-point shift in finalScale().
|     inline static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
|
|     // Scales 'sum' by fWeight / 2^32 with rounding and narrows the result to 8 bits.
|     uint8_t finalScale(uint32_t sum) const {
|         const uint64_t scaled = fWeight * sum + kHalf;
|         return SkTo<uint8_t>(scaled >> 32);
|     }
|
|     // The input is only A8 (a uint8_t per pixel), but the accumulated sums need single-channel
|     // 32-bit storage.
|     uint64_t  fWeight;
|     uint32_t* fBuffer0;
|     uint32_t* fBuffer0End;
|     uint32_t* fBuffer1;
|     uint32_t* fBuffer1End;
|     uint32_t* fBuffer2;
|     uint32_t* fBuffer2End;
|
|     // Blur state carried between startBlur() and successive blurSegment() calls.
|     uint32_t* fBuffer0Cursor;
|     uint32_t* fBuffer1Cursor;
|     uint32_t* fBuffer2Cursor;
|     uint32_t  fSum0;
|     uint32_t  fSum1;
|     uint32_t  fSum2;
| };
| |
| // CPU blur for kAlpha_8 images: tries GaussianPass<uint8_t> first and falls back to A8Pass.
| class RasterA8BlurAlgorithm : public SkBlurEngine::Algorithm {
| public:
|     // See analysis in description of GaussPass for the max supported sigma.
|     float maxSigma() const override {
|         static constexpr float kMaxSigma = 135.f;
|         SkASSERT(SkBlurEngine::BoxBlurWindow(kMaxSigma) <= 255);
|         return kMaxSigma;
|     }
|
|     // TODO: Implement CPU backend for different fTileMode. This is still worth doing inline
|     // with the blur; at the moment the tiling is applied via the CropImageFilter and carried as
|     // metadata on the FilterResult. This is forcefully applied in
|     // FilterResult::Builder::blur() when supportsOnlyDecalTiling() returns true.
|     bool supportsOnlyDecalTiling() const override { return true; }
|
|     sk_sp<SkSpecialImage> blur(SkSize sigma,
|                                sk_sp<SkSpecialImage> input,
|                                const SkIRect& originalSrcBounds,
|                                SkTileMode tileMode,
|                                const SkIRect& originalDstBounds) const override {
|         SkASSERT(tileMode == SkTileMode::kDecal);
|         SkASSERT(SkIRect::MakeSize(input->dimensions()).contains(originalSrcBounds));
|
|         SkBitmap src;
|         const bool haveBitmap = SkSpecialImages::AsBitmap(input.get(), &src);
|         if (!haveBitmap) {
|             // Should only have been called by CPU-backed images
|             return nullptr;
|         }
|         // The blur engine should not have picked this algorithm for a non-8-bit color type.
|         SkASSERT(src.colorType() == kAlpha_8_SkColorType);
|
|         // 1024 is a place holder guess until more analysis can be done.
|         SkSTArenaAlloc<1024> alloc;
|
|         // Picks the pass implementation for one axis: the first maker that accepts the sigma
|         // wins, and it is a fatal error if none does.
|         auto makeMaker = [&alloc](float axisSigma) -> PassMaker* {
|             // should be guaranteed after map_sigma
|             SkASSERT(0 <= axisSigma && axisSigma <= 135);
|             PassMaker* maker = GaussianPass<uint8_t>::MakeMaker(axisSigma, &alloc);
|             if (!maker) {
|                 maker = A8Pass::MakeMaker(axisSigma, &alloc);
|             }
|             if (!maker) {
|                 SK_ABORT("Sigma is out of range.");
|             }
|             return maker;
|         };
|
|         PassMaker* const makerX = makeMaker(sigma.width());
|         PassMaker* const makerY = makeMaker(sigma.height());
|
|         return eval_blur_passes<uint8_t>(makerX, makerY, src, originalSrcBounds,
|                                          originalDstBounds, &alloc);
|     }
| };
| |
| // CPU blur for RGBA_8888 / BGRA_8888 images. Per axis it tries GaussianPass<uint32_t> (unless
| // compiled out), then ThreeBoxApproxPass, then TentPass.
| class Raster8888BlurAlgorithm : public SkBlurEngine::Algorithm {
| public:
|     // See analysis in description of TentPass for the max supported sigma.
|     float maxSigma() const override {
|         // TentPass supports a sigma up to 2183, and was added so that the CPU blur algorithm's
|         // blur radius was as large as that supported by the GPU. GaussPass only supports up to
|         // 136. However, there is a very apparent pop in blur weight when switching from
|         // successive box blurs to the tent filter. The TentPass is preserved for legacy blurs,
|         // which do not use FilterResult::rescale(). However, using kMaxSigma = 135 with the
|         // raster SkBlurEngine ensures that the non-legacy raster blurs will always use the
|         // GaussPass implementation. This is about 6-7x faster on large blurs to rescale a few
|         // times to a lower resolution than it is to evaluate the much larger original window.
|         static constexpr float kMaxSigma = 135.f;
|         // see GaussPass::MakeMaker().
|         SkASSERT(SkBlurEngine::BoxBlurWindow(kMaxSigma) <= 255);
|         return kMaxSigma;
|     }
|
|     // TODO: Implement CPU backend for different fTileMode. This is still worth doing inline
|     // with the blur; at the moment the tiling is applied via the CropImageFilter and carried as
|     // metadata on the FilterResult. This is forcefully applied in
|     // FilterResult::Builder::blur() when supportsOnlyDecalTiling() returns true.
|     bool supportsOnlyDecalTiling() const override { return true; }
|
|     sk_sp<SkSpecialImage> blur(SkSize sigma,
|                                sk_sp<SkSpecialImage> input,
|                                const SkIRect& originalSrcBounds,
|                                SkTileMode tileMode,
|                                const SkIRect& originalDstBounds) const override {
|         // TODO: Enable this assert when the TentPass is no longer used for legacy blurs
|         // (which supports blur sigmas larger than what's reported in maxSigma()).
|         // SkASSERT(sigma.width() <= this->maxSigma() && sigma.height() <= this->maxSigma());
|         SkASSERT(tileMode == SkTileMode::kDecal);
|
|         SkASSERT(SkIRect::MakeSize(input->dimensions()).contains(originalSrcBounds));
|
|         SkBitmap src;
|         const bool haveBitmap = SkSpecialImages::AsBitmap(input.get(), &src);
|         if (!haveBitmap) {
|             // Should only have been called by CPU-backed images
|             return nullptr;
|         }
|         // The blur engine should not have picked this algorithm for a non-32-bit color type
|         SkASSERT(src.colorType() == kRGBA_8888_SkColorType ||
|                  src.colorType() == kBGRA_8888_SkColorType);
|
|         SkSTArenaAlloc<1024> alloc;
|
|         // Picks the pass implementation for one axis: the first maker that accepts the sigma
|         // wins, and it is a fatal error if none does.
|         auto makeMaker = [&alloc](float axisSigma) -> PassMaker* {
|             // should be guaranteed after map_sigma
|             SkASSERT(0 <= axisSigma && axisSigma <= 2183);
|             PassMaker* maker = nullptr;
| #ifndef SK_AVOID_SLOW_RASTER_PIPELINE_BLURS
|             maker = GaussianPass<uint32_t>::MakeMaker(axisSigma, &alloc);
| #endif //SK_AVOID_SLOW_RASTER_PIPELINE_BLURS
|             if (!maker) {
|                 maker = ThreeBoxApproxPass::MakeMaker(axisSigma, &alloc);
|             }
|             if (!maker) {
|                 maker = TentPass::MakeMaker(axisSigma, &alloc);
|             }
|             if (!maker) {
|                 SK_ABORT("Sigma is out of range.");
|             }
|             return maker;
|         };
|
|         PassMaker* const makerX = makeMaker(sigma.width());
|         PassMaker* const makerY = makeMaker(sigma.height());
|
|         return eval_blur_passes<uint32_t>(makerX, makerY, src, originalSrcBounds,
|                                           originalDstBounds, &alloc);
|     }
|
| };
| |
| // Shader-based blur algorithm that renders into CPU-backed SkBitmapDevices.
| class RasterShaderBlurAlgorithm : public SkShaderBlurAlgorithm {
| public:
|     sk_sp<SkDevice> makeDevice(const SkImageInfo& imageInfo) const override {
|         // The device is only ever used to draw blurs, so default surface properties suffice:
|         // pixel geometry and font configuration do not matter, and since this is not a GPU
|         // surface, DMSAA and the kAlwaysDither surface property are irrelevant as well.
|         const SkSurfaceProps defaultProps{};
|         return SkBitmapDevice::Create(imageInfo, defaultProps);
|     }
| };
| |
| // Blur engine for the raster backend; selects an algorithm purely from the color type.
| class RasterBlurEngine : public SkBlurEngine {
| public:
|     const Algorithm* findAlgorithm(SkSize sigma, SkColorType colorType) const override {
|         // For small sigmas, a8 and rgba blurs will use a gaussian blur, otherwise using
|         // box blur approximation.
|         switch (colorType) {
|             case kAlpha_8_SkColorType:
|                 return &fA8BlurAlgorithm;
|
|             // The box blur doesn't actually care about channel order as long as it's 4 8-bit
|             // channels.
|             case kRGBA_8888_SkColorType:
|             case kBGRA_8888_SkColorType:
|                 return &fRGBA8BlurAlgorithm;
|
|             default:
|                 return &fShaderBlurAlgorithm;
|         }
|     }
|
| private:
|     // Fallback for anything that is neither A8 nor 8888: the shader algorithm.
|     RasterShaderBlurAlgorithm fShaderBlurAlgorithm;
|     // RGBA8 / BGRA8: consecutive box blurs for large blurs, gaussian blur for small ones.
|     Raster8888BlurAlgorithm fRGBA8BlurAlgorithm;
|     // A8: consecutive box blurs for large blurs, gaussian blur for small ones.
|     RasterA8BlurAlgorithm fA8BlurAlgorithm;
| };
| |
| } // anonymous namespace |
| |
| const SkBlurEngine* SkBlurEngine::GetRasterBlurEngine() {
|     // A single function-local engine instance is shared by every caller.
|     static const RasterBlurEngine kInstance;
|     const SkBlurEngine* const engine = &kInstance;
|     return engine;
| }
| |
| // SkShaderBlurAlgorithm |
| // ---------------------------------------------------------------------------- |
| |
| // Writes a normalized 2D Gaussian kernel for 'sigma' / 'radius' into the front of 'kernel',
| // one row of KernelWidth(radius.width()) weights after another, and zeroes whatever remains.
| void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
|                                                 SkISize radius,
|                                                 SkSpan<float> kernel) {
|     // The radius is passed in because callers usually needed it before filling the kernel;
|     // verify it agrees with what the sigma implies.
|     SkASSERT(SkBlurEngine::SigmaToRadius(sigma.width()) == radius.width() &&
|              SkBlurEngine::SigmaToRadius(sigma.height()) == radius.height());
|
|     // Large sigmas must already have been downscaled by the caller to something the effects
|     // can handle, so the weights have to fit inside 'kernel'.
|     const int cols = KernelWidth(radius.width());
|     const int rows = KernelWidth(radius.height());
|     const size_t usedCount = SkTo<size_t>(sk_64_mul(cols, rows));
|     SkASSERT(usedCount <= kernel.size());
|
|     // An identity blur is defined such that 2*sigma^2 is not near zero whenever the radius is
|     // non-trivial.
|     const float twoVarX = 2.0f * sigma.width() * sigma.width();
|     const float twoVarY = 2.0f * sigma.height() * sigma.height();
|     SkASSERT((radius.width() == 0 || !SkScalarNearlyZero(twoVarX)) &&
|              (radius.height() == 0 || !SkScalarNearlyZero(twoVarY)));
|
|     // With a zero radius the factor is pinned to 1, which collapses the math to the 1D
|     // Gaussian along the other axis (and to a single weight of 1.0 when both radii are 0).
|     const float invTwoVarX = radius.width() > 0 ? 1.0f / twoVarX : 1.f;
|     const float invTwoVarY = radius.height() > 0 ? 1.0f / twoVarY : 1.f;
|
|     float total = 0.0f;
|     for (int col = 0; col < cols; col++) {
|         const float dx = static_cast<float>(col - radius.width());
|         const float xExponent = dx * dx * invTwoVarX;
|         for (int row = 0; row < rows; row++) {
|             const float dy = static_cast<float>(row - radius.height());
|             // The Gaussian's constant factor (1/(sqrt(2*pi*sigma^2)) is omitted because the
|             // kernel is renormalized below.
|             const float weight = std::exp(-(xExponent + dy * dy * invTwoVarY));
|             kernel[row * cols + col] = weight;
|             total += weight;
|         }
|     }
|
|     // Normalize so that the weights sum to one.
|     const float normalize = 1.0f / total;
|     for (size_t i = 0; i < usedCount; ++i) {
|         kernel[i] *= normalize;
|     }
|
|     // Clear the unused tail of the span.
|     std::fill_n(kernel.data() + usedCount, kernel.size() - usedCount, 0.0f);
| }
| |
| // Convenience overload: fills the SkV4-packed kernel by viewing it as kMaxSamples floats.
| void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
|                                                 SkISize radii,
|                                                 std::array<SkV4, kMaxSamples/4>& kernel) {
|     // Viewing the packed array as plain floats relies on identical size and alignment.
|     static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxSamples>));
|     static_assert(alignof(float) == alignof(SkV4));
|     Compute2DBlurKernel(sigma, radii, SkSpan<float>(kernel[0].ptr(), kMaxSamples));
| }
| |
| // Fills 'offsets' with the (x, y) sample offsets of a 2D kernel with the given radius, row by
| // row, and pads the remaining slots with copies of the last valid offset.
| void SkShaderBlurAlgorithm::Compute2DBlurOffsets(SkISize radius,
|                                                  std::array<SkV4, kMaxSamples/2>& offsets) {
|     const int kernelArea = KernelWidth(radius.width()) * KernelWidth(radius.height());
|     SkASSERT(kernelArea <= kMaxSamples);
|
|     // Two floats per sample, viewed as one flat run of kMaxSamples (x, y) pairs.
|     SkSpan<float> flat{offsets[0].ptr(), kMaxSamples*2};
|
|     int written = 0;
|     for (int dy = -radius.height(); dy <= radius.height(); ++dy) {
|         for (int dx = -radius.width(); dx <= radius.width(); ++dx) {
|             flat[2*written]     = static_cast<float>(dx);
|             flat[2*written + 1] = static_cast<float>(dy);
|             ++written;
|         }
|     }
|     SkASSERT(written == kernelArea);
|
|     // Repeat the last real offset through the unused slots.
|     const int lastPair = 2*(kernelArea - 1);
|     while (written < kMaxSamples) {
|         flat[2*written]     = flat[lastPair];
|         flat[2*written + 1] = flat[lastPair + 1];
|         ++written;
|     }
| }
| |
| // Fills 'offsetsAndKernel' with the sample offsets and weights of a 1D blur in which adjacent
| // taps of the full kernel are merged into single samples (see the derivation below). The output
| // is interleaved two samples per SkV4 as {offset, weight, offset, weight}. Slots past the
| // LinearKernelWidth(radius) valid samples get a weight of 0 and repeat the last valid offset.
| void SkShaderBlurAlgorithm::Compute1DBlurLinearKernel(
| float sigma,
| int radius,
| std::array<SkV4, kMaxSamples/2>& offsetsAndKernel) {
| SkASSERT(sigma <= kMaxLinearSigma);
| SkASSERT(radius == SkBlurEngine::SigmaToRadius(sigma));
| SkASSERT(LinearKernelWidth(radius) <= kMaxSamples);
|
| // Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
| // The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
| // Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
| // Solving W' * x = Wj, W' * (1 - x) = Wi:
| // W' = Wi + Wj
| // x = Wj / (Wi + Wj)
| auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
| *new_w = wi + wj;
| *offset = wj / (wi + wj);
| };
|
| // Create a temporary standard kernel. The maximum blur radius that can be passed to this
| // function is (kMaxSamples - 1), so make an array large enough to hold the full kernel width.
| static constexpr int kMaxKernelWidth = KernelWidth(kMaxSamples - 1);
| SkASSERT(KernelWidth(radius) <= kMaxKernelWidth);
| std::array<float, kMaxKernelWidth> fullKernel;
| Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), KernelWidth(radius)});
|
| std::array<float, kMaxSamples> kernel;
| std::array<float, kMaxSamples> offsets;
| // Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output array.
| int halfSize = LinearKernelWidth(radius);
| // Output index of the center sample; lowIndex walks downward from just below it while the
| // loop below walks upward, so the two halves are written as mirror images.
| int halfRadius = halfSize / 2;
| int lowIndex = halfRadius - 1;
|
| // Compute1DBlurKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
| // compute only the upper half and mirror to the lower half.
|
| // 'index' walks the full kernel, starting at its center tap.
| int index = radius;
| if (radius & 1) {
| // If N is odd, then use two samples.
| // The centre texel gets sampled twice, so halve its influence for each sample.
| // We essentially sample like this:
| // Texel edges
| // v v v v
| // | | | |
| // \-----^---/ Lower sample
| // \---^-----/ Upper sample
| get_new_weight(&kernel[halfRadius],
| &offsets[halfRadius],
| fullKernel[index] * 0.5f,
| fullKernel[index + 1]);
| kernel[lowIndex] = kernel[halfRadius];
| offsets[lowIndex] = -offsets[halfRadius];
| index++;
| lowIndex--;
| } else {
| // If N is even, then there are an even number of texels on either side of the centre texel.
| // Sample the centre texel directly.
| kernel[halfRadius] = fullKernel[index];
| offsets[halfRadius] = 0.0f;
| }
| index++;
|
| // Every other pair gets one sample.
| for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
| get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
| // get_new_weight() yields an offset relative to tap 'index'; shift it so that it is
| // relative to the kernel center instead.
| offsets[i] += static_cast<float>(index - radius);
|
| // Mirror to lower half.
| kernel[lowIndex] = kernel[i];
| offsets[lowIndex] = -offsets[i];
| }
|
| // Zero out remaining values in the kernel
| memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxSamples - halfSize));
| // But copy the last valid offset into the remaining offsets, to increase the chance that
| // over-iteration in a fragment shader will have a cache hit.
| for (int i = halfSize; i < kMaxSamples; ++i) {
| offsets[i] = offsets[halfSize - 1];
| }
|
| // Interleave into the output array to match the 1D SkSL effect
| for (int i = 0; i < kMaxSamples / 2; ++i) {
| offsetsAndKernel[i] = SkV4{offsets[2*i], kernel[2*i], offsets[2*i+1], kernel[2*i+1]};
| }
| }
| |
| // Maps a kernel width onto the stable key of the effect variant that covers it, as an offset
| // from 'baseKey'.
| static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
|     SkASSERT(kernelWidth >= 2 && kernelWidth <= SkShaderBlurAlgorithm::kMaxSamples);
|
|     // Only widths 2..28 have a variant (a width of 1 can't happen).
|     if (kernelWidth < 2 || kernelWidth > 28) {
|         SkUNREACHABLE;
|     }
|
|     // Widths up to 20 are batched on multiples of 4: 2-4, 5-8, 9-12, 13-16 and 17-20 map to
|     // variants 0 through 4. The larger widths 21-28 share a single variant, i.e. a batch of
|     // eight with up to 7 wasted samples.
|     const uint32_t variant =
|             kernelWidth <= 20 ? static_cast<uint32_t>(kernelWidth - 1) / 4 : 5u;
|     return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey + variant);
| }
| |
| // Returns the known 1D linear-blur runtime effect whose variant covers 'radius'.
| const SkRuntimeEffect* SkShaderBlurAlgorithm::GetLinearBlur1DEffect(int radius) {
|     const uint32_t baseKey =
|             static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase);
|     const int kernelWidth = LinearKernelWidth(radius);
|     return GetKnownRuntimeEffect(to_stablekey(kernelWidth, baseKey));
| }
| |
| // Returns the known 2D blur runtime effect whose variant covers the kernel area of 'radii'.
| const SkRuntimeEffect* SkShaderBlurAlgorithm::GetBlur2DEffect(const SkISize& radii) {
|     const uint32_t baseKey =
|             static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase);
|     const int kernelArea = KernelWidth(radii.width()) * KernelWidth(radii.height());
|     return GetKnownRuntimeEffect(to_stablekey(kernelArea, baseKey));
| }
| |
| // Draws the blur described by 'blurEffectBuilder' over 'dstRect' into a new device and returns
| // a snapshot of it. The builder's "child" shader is configured here: a non-strict shader for
| // the interior where that is possible, and a strict, subset-tiled shader for the rest.
| sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::renderBlur(SkRuntimeShaderBuilder* blurEffectBuilder,
|                                                         SkFilterMode filter,
|                                                         SkISize radii,
|                                                         sk_sp<SkSpecialImage> input,
|                                                         const SkIRect& srcRect,
|                                                         SkTileMode tileMode,
|                                                         const SkIRect& dstRect) const {
|     const SkImageInfo outII = SkImageInfo::Make(dstRect.size(),
|                                                 input->colorType(),
|                                                 kPremul_SkAlphaType,
|                                                 input->colorInfo().refColorSpace());
|     sk_sp<SkDevice> device = this->makeDevice(outII);
|     if (!device) {
|         return nullptr;
|     }
|
|     // Draw in dstRect's coordinate space, clipped to the device's extent.
|     const SkIRect subset = SkIRect::MakeSize(dstRect.size());
|     device->clipRect(SkRect::Make(subset), SkClipOp::kIntersect, /*aa=*/false);
|     device->setLocalToDevice(SkM44::Translate(-dstRect.left(), -dstRect.top()));
|
|     // Whether dstRect ends up covered by a mix of fast and strict draws or by one strict draw,
|     // only the shader (really just its strict mode) differs; the rest of the paint is shared.
|     SkPaint paint;
|     paint.setBlendMode(SkBlendMode::kSrc);
|
|     const SkIRect safeSrcRect = srcRect.makeInset(radii.width(), radii.height());
|     SkIRect fastDstRect = dstRect;
|
|     // safeSrcRect only matters for shader-based tiling, i.e. when srcRect differs from the
|     // backing store dimensions; when they match the full image we can use HW tiling.
|     const bool srcIsWholeImage = srcRect == SkIRect::MakeSize(input->backingStoreDimensions());
|     if (!srcIsWholeImage) {
|         const bool overlapsSafeArea = fastDstRect.intersect(safeSrcRect);
|         // A small non-clamping area isn't worth a separate draw; a single draw that performs
|         // shader tiling over the whole dst is better.
|         const bool tooSmallToSplit =
|                 overlapsSafeArea &&
|                 fastDstRect != dstRect &&
|                 fastDstRect.width() * fastDstRect.height() < 128 * 128;
|         if (!overlapsSafeArea || tooSmallToSplit) {
|             fastDstRect.setEmpty();
|         }
|     }
|
|     if (!fastDstRect.isEmpty()) {
|         // Cover as much as possible without per-sample shader tiling logic, switching to clamp
|         // tiling if we aren't in this block due to HW tiling.
|         const SkIRect untiledSrcRect = srcRect.makeInset(1, 1);
|         SkTileMode fastTileMode = tileMode;
|         if (untiledSrcRect.contains(fastDstRect)) {
|             fastTileMode = SkTileMode::kClamp;
|         }
|         blurEffectBuilder->child("child") = input->asShader(
|                 fastTileMode, filter, SkMatrix::I(), /*strict=*/false);
|         paint.setShader(blurEffectBuilder->makeShader());
|         device->drawRect(SkRect::Make(fastDstRect), paint);
|     }
|
|     // Anything not covered by the fast draw needs the strict shader.
|     const bool needsStrictDraws = fastDstRect != dstRect;
|     if (needsStrictDraws) {
|         blurEffectBuilder->child("child") = input->makeSubset(srcRect)->asShader(
|                 tileMode, filter, SkMatrix::Translate(srcRect.left(), srcRect.top()));
|         paint.setShader(blurEffectBuilder->makeShader());
|     }
|
|     if (fastDstRect.isEmpty()) {
|         // No fast interior: the strict shader fills the entire dst.
|         device->drawRect(SkRect::Make(dstRect), paint);
|     } else if (needsStrictDraws) {
|         // Up to four strict draws fill in the border around the fast interior. Left and right
|         // span the full height of the dst rect; top and bottom span just the width of the fast
|         // interior. Border pieces with zero width/height are skipped.
|         const SkIRect borderPieces[] = {
|             // Left, spanning full height
|             {dstRect.left(), dstRect.top(), fastDstRect.left(), dstRect.bottom()},
|             // Right, spanning full height
|             {fastDstRect.right(), dstRect.top(), dstRect.right(), dstRect.bottom()},
|             // Top, spanning inner width
|             {fastDstRect.left(), dstRect.top(), fastDstRect.right(), fastDstRect.top()},
|             // Bottom, spanning inner width
|             {fastDstRect.left(), fastDstRect.bottom(), fastDstRect.right(), dstRect.bottom()},
|         };
|         for (const SkIRect& piece : borderPieces) {
|             if (!piece.isEmpty()) {
|                 device->drawRect(SkRect::Make(piece), paint);
|             }
|         }
|     }
|
|     return device->snapSpecial(subset);
| }
| |
| // Evaluates the blur in a single pass using the 2D kernel effect.
| sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur2D(SkSize sigma,
|                                                         SkISize radii,
|                                                         sk_sp<SkSpecialImage> input,
|                                                         const SkIRect& srcRect,
|                                                         SkTileMode tileMode,
|                                                         const SkIRect& dstRect) const {
|     // Uniform data for the effect: per-sample weights and their (x, y) offsets.
|     std::array<SkV4, kMaxSamples/4> kernel;
|     Compute2DBlurKernel(sigma, radii, kernel);
|
|     std::array<SkV4, kMaxSamples/2> offsets;
|     Compute2DBlurOffsets(radii, offsets);
|
|     const SkRuntimeEffect* const effect = GetBlur2DEffect(radii);
|     SkRuntimeShaderBuilder builder{sk_ref_sp(effect)};
|     builder.uniform("kernel") = kernel;
|     builder.uniform("offsets") = offsets;
|
|     // NOTE: renderBlur() will configure the "child" shader as needed. Nearest-neighbor
|     // filtering is all the 2D blur effect requires.
|     return this->renderBlur(&builder, SkFilterMode::kNearest, radii,
|                             std::move(input), srcRect, tileMode, dstRect);
| }
| |
| // Evaluates one pass of the 1D linear blur effect along 'dir'.
| sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur1D(float sigma,
|                                                         int radius,
|                                                         SkV2 dir,
|                                                         sk_sp<SkSpecialImage> input,
|                                                         SkIRect srcRect,
|                                                         SkTileMode tileMode,
|                                                         SkIRect dstRect) const {
|     std::array<SkV4, kMaxSamples/2> offsetsAndKernel;
|     Compute1DBlurLinearKernel(sigma, radius, offsetsAndKernel);
|
|     const SkRuntimeEffect* const effect = GetLinearBlur1DEffect(radius);
|     SkRuntimeShaderBuilder builder{sk_ref_sp(effect)};
|     builder.uniform("offsetsAndKernel") = offsetsAndKernel;
|     builder.uniform("dir") = dir;
|
|     // Rebuild the "2D" radii inset that renderBlur() expects: the radius applies only along
|     // the axes for which 'dir' is non-zero.
|     const int insetX = dir.x != 0.f ? radius : 0;
|     const int insetY = dir.y != 0.f ? radius : 0;
|     const SkISize radii{insetX, insetY};
|
|     // NOTE: renderBlur() will configure the "child" shader as needed. The 1D blur effect
|     // requires linear filtering.
|     return this->renderBlur(&builder, SkFilterMode::kLinear, radii,
|                             std::move(input), srcRect, tileMode, dstRect);
| }
| |
| // Blurs 'src' using a single 2D pass when the kernel is small enough and genuinely 2D, or
| // otherwise with up to two 1D passes (X first, then Y).
| sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::blur(SkSize sigma,
|                                                   sk_sp<SkSpecialImage> src,
|                                                   const SkIRect& srcRect,
|                                                   SkTileMode tileMode,
|                                                   const SkIRect& dstRect) const {
|     SkASSERT(sigma.width() <= kMaxLinearSigma && sigma.height() <= kMaxLinearSigma);
|
|     const int radiusX = SkBlurEngine::SigmaToRadius(sigma.width());
|     const int radiusY = SkBlurEngine::SigmaToRadius(sigma.height());
|     const bool blursX = radiusX > 0;
|     const bool blursY = radiusY > 0;
|
|     const int kernelArea = KernelWidth(radiusX) * KernelWidth(radiusY);
|     if (kernelArea <= kMaxSamples && blursX && blursY) {
|         // Use a single-pass 2D kernel if it fits and isn't just 1D already
|         return this->evalBlur2D(sigma,
|                                 {radiusX, radiusY},
|                                 std::move(src),
|                                 srcRect,
|                                 tileMode,
|                                 dstRect);
|     }
|
|     // Otherwise use two passes of a 1D kernel (one per axis).
|     SkIRect intermediateSrcRect = srcRect;
|     SkIRect intermediateDstRect = dstRect;
|
|     if (blursX) {
|         if (blursY) {
|             // The follow-up Y pass may need extra rows above and below 'dstRect'.
|             const bool periodicTiling =
|                     tileMode == SkTileMode::kRepeat || tileMode == SkTileMode::kMirror;
|             if (periodicTiling) {
|                 // When srcRect and dstRect are aligned, the periodic tiling on srcRect is the
|                 // same for the intermediate and no extra rows are needed. Otherwise outset by
|                 // the Y radius.
|                 const int period = srcRect.height() * (tileMode == SkTileMode::kMirror ? 2 : 1);
|                 const bool aligned = std::abs(dstRect.fTop - srcRect.fTop) % period == 0 &&
|                                      dstRect.height() == srcRect.height();
|                 if (!aligned) {
|                     intermediateDstRect.outset(0, radiusY);
|                 }
|             } else {
|                 // For clamp and decal tiling, outset by the Y radius but only as far as
|                 // srcRect reaches; anything beyond that is identical to tiling the
|                 // intermediate dst image directly.
|                 intermediateDstRect.outset(0, radiusY);
|                 intermediateDstRect.fTop = std::max(intermediateDstRect.fTop, srcRect.fTop);
|                 intermediateDstRect.fBottom =
|                         std::min(intermediateDstRect.fBottom, srcRect.fBottom);
|                 if (intermediateDstRect.fTop >= intermediateDstRect.fBottom) {
|                     return nullptr;
|                 }
|             }
|         }
|
|         src = this->evalBlur1D(sigma.width(),
|                                radiusX,
|                                /*dir=*/{1.f, 0.f},
|                                std::move(src),
|                                srcRect,
|                                tileMode,
|                                intermediateDstRect);
|         if (!src) {
|             return nullptr;
|         }
|
|         // The Y pass reads the whole intermediate image; express dstRect relative to it.
|         intermediateSrcRect = SkIRect::MakeWH(src->width(), src->height());
|         intermediateDstRect = dstRect.makeOffset(-intermediateDstRect.left(),
|                                                  -intermediateDstRect.top());
|     }
|
|     if (blursY) {
|         src = this->evalBlur1D(sigma.height(),
|                                radiusY,
|                                /*dir=*/{0.f, 1.f},
|                                std::move(src),
|                                intermediateSrcRect,
|                                tileMode,
|                                intermediateDstRect);
|     }
|
|     return src;
| }