src/core/SkBlurEngine.cpp - skia - Git at Google

 /*
  * Copyright 2024 Google LLC
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #include "src/core/SkBlurEngine.h"

 #include "include/core/SkAlphaType.h"
 #include "include/core/SkBlendMode.h"
 #include "include/core/SkClipOp.h"
 #include "include/core/SkColorSpace.h"
 #include "include/core/SkImageInfo.h"
 #include "include/core/SkM44.h"
 #include "include/core/SkMatrix.h"
 #include "include/core/SkPaint.h"
 #include "include/core/SkRect.h"
 #include "include/core/SkSamplingOptions.h"
 #include "include/core/SkScalar.h"
 #include "include/core/SkShader.h"
 #include "include/effects/SkRuntimeEffect.h"
 #include "include/private/base/SkAssert.h"
 #include "include/private/base/SkMath.h"
 #include "include/private/base/SkTo.h"
 #include "src/core/SkDevice.h"
 #include "src/core/SkKnownRuntimeEffects.h"
 #include "src/core/SkSpecialImage.h"

 #include <array>
 #include <cmath>
 #include <cstdint>
 #include <cstring>
 #include <utility>

 void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
                                                 SkISize radius,
                                                 SkSpan<float> kernel) {
     // Callers likely had to calculate the radius prior to filling out the kernel value, which is
     // why it's provided; but make sure it's consistent with expectations.
     SkASSERT(SkBlurEngine::SigmaToRadius(sigma.width()) == radius.width() &&
              SkBlurEngine::SigmaToRadius(sigma.height()) == radius.height());

     // Callers are responsible for downscaling large sigmas to values that can be processed by the
     // effects, so ensure the radius won't overflow 'kernel'
     const int width = KernelWidth(radius.width());
     const int height = KernelWidth(radius.height());
     const size_t kernelSize = SkTo<size_t>(sk_64_mul(width, height));
     SkASSERT(kernelSize <= kernel.size());

     // And the definition of an identity blur should be sufficient that 2sigma^2 isn't near zero
     // when there's a non-trivial radius.
     const float twoSigmaSqrdX = 2.0f * sigma.width() * sigma.width();
     const float twoSigmaSqrdY = 2.0f * sigma.height() * sigma.height();
     SkASSERT((radius.width() == 0 || !SkScalarNearlyZero(twoSigmaSqrdX)) &&
              (radius.height() == 0 || !SkScalarNearlyZero(twoSigmaSqrdY)));

     // Setting the denominator to 1 when the radius is 0 automatically converts the remaining math
     // to the 1D Gaussian distribution. When both radii are 0, it correctly computes a weight of 1.0
     const float sigmaXDenom = radius.width() > 0 ? 1.0f / twoSigmaSqrdX : 1.f;
     const float sigmaYDenom = radius.height() > 0 ? 1.0f / twoSigmaSqrdY : 1.f;

     float sum = 0.0f;
     for (int x = 0; x < width; x++) {
         float xTerm = static_cast<float>(x - radius.width());
         xTerm = xTerm * xTerm * sigmaXDenom;
         for (int y = 0; y < height; y++) {
             float yTerm = static_cast<float>(y - radius.height());
             float xyTerm = std::exp(-(xTerm + yTerm * yTerm * sigmaYDenom));
             // Note that the constant term (1/(sqrt(2*pi*sigma^2)) of the Gaussian
             // is dropped here, since we renormalize the kernel below.
             kernel[y * width + x] = xyTerm;
             sum += xyTerm;
         }
     }
     // Normalize the kernel
     float scale = 1.0f / sum;
     for (size_t i = 0; i < kernelSize; ++i) {
         kernel[i] *= scale;
     }
     // Zero remainder of the array
     memset(kernel.data() + kernelSize, 0, sizeof(float)*(kernel.size() - kernelSize));
 }

 void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
                                                 SkISize radii,
                                                 std::array<SkV4, kMaxSamples/4>& kernel) {
     static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxSamples>));
     static_assert(alignof(float) == alignof(SkV4));
     float* data = kernel[0].ptr();
     Compute2DBlurKernel(sigma, radii, SkSpan<float>(data, kMaxSamples));
 }

 void SkShaderBlurAlgorithm::Compute2DBlurOffsets(SkISize radius,
                                                  std::array<SkV4, kMaxSamples/2>& offsets) {
     const int kernelArea = KernelWidth(radius.width()) * KernelWidth(radius.height());
     SkASSERT(kernelArea <= kMaxSamples);

     SkSpan<float> offsetView{offsets[0].ptr(), kMaxSamples*2};

     int i = 0;
     for (int y = -radius.height(); y <= radius.height(); ++y) {
         for (int x = -radius.width(); x <= radius.width(); ++x) {
             offsetView[2*i]   = x;
             offsetView[2*i+1] = y;
             ++i;
         }
     }
     SkASSERT(i == kernelArea);
     const int lastValidOffset = 2*(kernelArea - 1);
     for (; i < kMaxSamples; ++i) {
         offsetView[2*i]   = offsetView[lastValidOffset];
         offsetView[2*i+1] = offsetView[lastValidOffset+1];
     }
 }

 void SkShaderBlurAlgorithm::Compute1DBlurLinearKernel(
         float sigma,
         int radius,
         std::array<SkV4, kMaxSamples/2>& offsetsAndKernel) {
     SkASSERT(sigma <= kMaxLinearSigma);
     SkASSERT(radius == SkBlurEngine::SigmaToRadius(sigma));
     SkASSERT(LinearKernelWidth(radius) <= kMaxSamples);

     // Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
     // The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
     // Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
     // Solving W' * x = Wj, W' * (1 - x) = Wi:
     // W' = Wi + Wj
     // x = Wj / (Wi + Wj)
     auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
         *new_w = wi + wj;
         *offset = wj / (wi + wj);
     };

     // Create a temporary standard kernel. The maximum blur radius that can be passed to this
     // function is (kMaxBlurSamples-1), so make an array large enough to hold the full kernel width.
     static constexpr int kMaxKernelWidth = KernelWidth(kMaxSamples - 1);
     SkASSERT(KernelWidth(radius) <= kMaxKernelWidth);
     std::array<float, kMaxKernelWidth> fullKernel;
     Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), KernelWidth(radius)});

     std::array<float, kMaxSamples> kernel;
     std::array<float, kMaxSamples> offsets;
     // Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output array.
     int halfSize = LinearKernelWidth(radius);
     int halfRadius = halfSize / 2;
     int lowIndex = halfRadius - 1;

     // Compute1DGaussianKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
     // compute only the upper half and mirror to the lower half.

     int index = radius;
     if (radius & 1) {
         // If N is odd, then use two samples.
         // The centre texel gets sampled twice, so halve its influence for each sample.
         // We essentially sample like this:
         // Texel edges
         // v    v    v    v
         // |    |    |    |
         // \-----^---/ Lower sample
         //      \---^-----/ Upper sample
         get_new_weight(&kernel[halfRadius],
                        &offsets[halfRadius],
                        fullKernel[index] * 0.5f,
                        fullKernel[index + 1]);
         kernel[lowIndex] = kernel[halfRadius];
         offsets[lowIndex] = -offsets[halfRadius];
         index++;
         lowIndex--;
     } else {
         // If N is even, then there are an even number of texels on either side of the centre texel.
         // Sample the centre texel directly.
         kernel[halfRadius] = fullKernel[index];
         offsets[halfRadius] = 0.0f;
     }
     index++;

     // Every other pair gets one sample.
     for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
         get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
         offsets[i] += static_cast<float>(index - radius);

         // Mirror to lower half.
         kernel[lowIndex] = kernel[i];
         offsets[lowIndex] = -offsets[i];
     }

     // Zero out remaining values in the kernel
     memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxSamples - halfSize));
     // But copy the last valid offset into the remaining offsets, to increase the chance that
     // over-iteration in a fragment shader will have a cache hit.
     for (int i = halfSize; i < kMaxSamples; ++i) {
         offsets[i] = offsets[halfSize - 1];
     }

     // Interleave into the output array to match the 1D SkSL effect
     for (int i = 0; i < kMaxSamples / 2; ++i) {
         offsetsAndKernel[i] = SkV4{offsets[2*i], kernel[2*i], offsets[2*i+1], kernel[2*i+1]};
     }
 }

 static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
     SkASSERT(kernelWidth >= 2 && kernelWidth <= SkShaderBlurAlgorithm::kMaxSamples);
     switch(kernelWidth) {
         // Batch on multiples of 4 (skipping width=1, since that can't happen)
         case 2:  [[fallthrough]];
         case 3:  [[fallthrough]];
         case 4:  return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey);
         case 5:  [[fallthrough]];
         case 6:  [[fallthrough]];
         case 7:  [[fallthrough]];
         case 8:  return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+1);
         case 9:  [[fallthrough]];
         case 10: [[fallthrough]];
         case 11: [[fallthrough]];
         case 12: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+2);
         case 13: [[fallthrough]];
         case 14: [[fallthrough]];
         case 15: [[fallthrough]];
         case 16: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+3);
         case 17: [[fallthrough]];
         case 18: [[fallthrough]];
         case 19: [[fallthrough]];
         // With larger kernels, batch on multiples of eight so up to 7 wasted samples.
         case 20: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+4);
         case 21: [[fallthrough]];
         case 22: [[fallthrough]];
         case 23: [[fallthrough]];
         case 24: [[fallthrough]];
         case 25: [[fallthrough]];
         case 26: [[fallthrough]];
         case 27: [[fallthrough]];
         case 28: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+5);
         default:
             SkUNREACHABLE;
     }
 }

 const SkRuntimeEffect* SkShaderBlurAlgorithm::GetLinearBlur1DEffect(int radius) {
     return GetKnownRuntimeEffect(
             to_stablekey(LinearKernelWidth(radius),
                          static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase)));
 }

 const SkRuntimeEffect* SkShaderBlurAlgorithm::GetBlur2DEffect(const SkISize& radii) {
     int kernelArea = KernelWidth(radii.width()) * KernelWidth(radii.height());
     return GetKnownRuntimeEffect(
             to_stablekey(kernelArea,
                          static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase)));
 }

 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::renderBlur(sk_sp<SkShader> blurEffect,
                                                         const SkIRect& dstRect,
                                                         SkColorType colorType,
                                                         sk_sp<SkColorSpace> colorSpace) const {
     SkImageInfo outII = SkImageInfo::Make({dstRect.width(), dstRect.height()},
                                           colorType, kPremul_SkAlphaType, std::move(colorSpace));
     sk_sp<SkDevice> device = this->makeDevice(outII);
     if (!device) {
         return nullptr;
     }

     // TODO(b/294102201): This is very much like AutoSurface in SkImageFilterTypes.cpp
     SkIRect subset = SkIRect::MakeSize(dstRect.size());
     device->clipRect(SkRect::Make(subset), SkClipOp::kIntersect, /*aa=*/false);
     device->setLocalToDevice(SkM44::Translate(-dstRect.left(), -dstRect.top()));
     SkPaint paint;
     paint.setBlendMode(SkBlendMode::kSrc);
     paint.setShader(std::move(blurEffect));
     device->drawPaint(paint);
     return device->snapSpecial(subset);
 }

 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur2D(SkSize sigma,
                                                         SkISize radii,
                                                         sk_sp<SkSpecialImage> input,
                                                         const SkIRect& srcRect,
                                                         SkTileMode tileMode,
                                                         const SkIRect& dstRect) const {
     std::array<SkV4, kMaxSamples/4> kernel;
     std::array<SkV4, kMaxSamples/2> offsets;
     Compute2DBlurKernel(sigma, radii, kernel);
     Compute2DBlurOffsets(radii, offsets);

     SkRuntimeShaderBuilder builder{sk_ref_sp(GetBlur2DEffect(radii))};
     builder.uniform("kernel") = kernel;
     builder.uniform("offsets") = offsets;
     // TODO(b/294102201): This is very much like FilterResult::asShader()...
     builder.child("child") =
             input->makeSubset(srcRect)->asShader(tileMode,
                                                  SkFilterMode::kNearest,
                                                  SkMatrix::Translate(srcRect.left(),srcRect.top()));

     return this->renderBlur(builder.makeShader(), dstRect,
                             input->colorType(), input->colorInfo().refColorSpace());
 }

 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur1D(float sigma,
                                                         int radius,
                                                         SkV2 dir,
                                                         sk_sp<SkSpecialImage> input,
                                                         SkIRect srcRect,
                                                         SkTileMode tileMode,
                                                         SkIRect dstRect) const {
     std::array<SkV4, kMaxSamples/2> offsetsAndKernel;
     Compute1DBlurLinearKernel(sigma, radius, offsetsAndKernel);

     SkRuntimeShaderBuilder builder{sk_ref_sp(GetLinearBlur1DEffect(radius))};
     builder.uniform("offsetsAndKernel") = offsetsAndKernel;
     builder.uniform("dir") = dir;
     // TODO(b/294102201): This is very much like FilterResult::asShader()...
     builder.child("child") =
             input->makeSubset(srcRect)->asShader(tileMode,
                                                  SkFilterMode::kLinear,
                                                  SkMatrix::Translate(srcRect.left(),srcRect.top()));

     return this->renderBlur(builder.makeShader(), dstRect,
                             input->colorType(), input->colorInfo().refColorSpace());
 }

 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::blur(SkSize sigma,
                                                   sk_sp<SkSpecialImage> src,
                                                   const SkIRect& srcRect,
                                                   SkTileMode tileMode,
                                                   const SkIRect& dstRect) const {
     SkASSERT(sigma.width() <= kMaxLinearSigma &&  sigma.height() <= kMaxLinearSigma);

     int radiusX = SkBlurEngine::SigmaToRadius(sigma.width());
     int radiusY = SkBlurEngine::SigmaToRadius(sigma.height());
     const int kernelArea = KernelWidth(radiusX) * KernelWidth(radiusY);
     if (kernelArea <= kMaxSamples && radiusX > 0 && radiusY > 0) {
         // Use a single-pass 2D kernel if it fits and isn't just 1D already
         return this->evalBlur2D(sigma,
                                 {radiusX, radiusY},
                                 std::move(src),
                                 srcRect,
                                 tileMode,
                                 dstRect);
     } else {
         // Use two passes of a 1D kernel (one per axis).
         SkIRect intermediateSrcRect = srcRect;
         SkIRect intermediateDstRect = dstRect;
         if (radiusX > 0) {
             if (radiusY > 0) {
                 // Outset the output size of dstRect by the radius required for the next Y pass
                 intermediateDstRect.outset(0, radiusY);
                 if (!intermediateDstRect.intersect(srcRect.makeOutset(radiusX, radiusY))) {
                     return nullptr;
                 }
             }

             src = this->evalBlur1D(sigma.width(),
                                    radiusX,
                                    /*dir=*/{1.f, 0.f},
                                    std::move(src),
                                    srcRect,
                                    tileMode,
                                    intermediateDstRect);
             if (!src) {
                 return nullptr;
             }
             intermediateSrcRect = SkIRect::MakeWH(src->width(), src->height());
             intermediateDstRect = dstRect.makeOffset(-intermediateDstRect.left(),
                                                      -intermediateDstRect.top());
         }

         if (radiusY > 0) {
             src = this->evalBlur1D(sigma.height(),
                                    radiusY,
                                    /*dir=*/{0.f, 1.f},
                                    std::move(src),
                                    intermediateSrcRect,
                                    tileMode,
                                    intermediateDstRect);
         }

         return src;
     }
 }
	/*
	* Copyright 2024 Google LLC
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#include "src/core/SkBlurEngine.h"

	#include "include/core/SkAlphaType.h"
	#include "include/core/SkBlendMode.h"
	#include "include/core/SkClipOp.h"
	#include "include/core/SkColorSpace.h"
	#include "include/core/SkImageInfo.h"
	#include "include/core/SkM44.h"
	#include "include/core/SkMatrix.h"
	#include "include/core/SkPaint.h"
	#include "include/core/SkRect.h"
	#include "include/core/SkSamplingOptions.h"
	#include "include/core/SkScalar.h"
	#include "include/core/SkShader.h"
	#include "include/effects/SkRuntimeEffect.h"
	#include "include/private/base/SkAssert.h"
	#include "include/private/base/SkMath.h"
	#include "include/private/base/SkTo.h"
	#include "src/core/SkDevice.h"
	#include "src/core/SkKnownRuntimeEffects.h"
	#include "src/core/SkSpecialImage.h"

	#include <array>
	#include <cmath>
	#include <cstdint>
	#include <cstring>
	#include <utility>

	void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
	SkISize radius,
	SkSpan<float> kernel) {
	// Callers likely had to calculate the radius prior to filling out the kernel value, which is
	// why it's provided; but make sure it's consistent with expectations.
	SkASSERT(SkBlurEngine::SigmaToRadius(sigma.width()) == radius.width() &&
	SkBlurEngine::SigmaToRadius(sigma.height()) == radius.height());

	// Callers are responsible for downscaling large sigmas to values that can be processed by the
	// effects, so ensure the radius won't overflow 'kernel'
	const int width = KernelWidth(radius.width());
	const int height = KernelWidth(radius.height());
	const size_t kernelSize = SkTo<size_t>(sk_64_mul(width, height));
	SkASSERT(kernelSize <= kernel.size());

	// And the definition of an identity blur should be sufficient that 2sigma^2 isn't near zero
	// when there's a non-trivial radius.
	const float twoSigmaSqrdX = 2.0f * sigma.width() * sigma.width();
	const float twoSigmaSqrdY = 2.0f * sigma.height() * sigma.height();
	SkASSERT((radius.width() == 0 \|\| !SkScalarNearlyZero(twoSigmaSqrdX)) &&
	(radius.height() == 0 \|\| !SkScalarNearlyZero(twoSigmaSqrdY)));

	// Setting the denominator to 1 when the radius is 0 automatically converts the remaining math
	// to the 1D Gaussian distribution. When both radii are 0, it correctly computes a weight of 1.0
	const float sigmaXDenom = radius.width() > 0 ? 1.0f / twoSigmaSqrdX : 1.f;
	const float sigmaYDenom = radius.height() > 0 ? 1.0f / twoSigmaSqrdY : 1.f;

	float sum = 0.0f;
	for (int x = 0; x < width; x++) {
	float xTerm = static_cast<float>(x - radius.width());
	xTerm = xTerm * xTerm * sigmaXDenom;
	for (int y = 0; y < height; y++) {
	float yTerm = static_cast<float>(y - radius.height());
	float xyTerm = std::exp(-(xTerm + yTerm * yTerm * sigmaYDenom));
	// Note that the constant term (1/(sqrt(2pisigma^2)) of the Gaussian
	// is dropped here, since we renormalize the kernel below.
	kernel[y * width + x] = xyTerm;
	sum += xyTerm;
	}
	}
	// Normalize the kernel
	float scale = 1.0f / sum;
	for (size_t i = 0; i < kernelSize; ++i) {
	kernel[i] *= scale;
	}
	// Zero remainder of the array
	memset(kernel.data() + kernelSize, 0, sizeof(float)*(kernel.size() - kernelSize));
	}

	void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
	SkISize radii,
	std::array<SkV4, kMaxSamples/4>& kernel) {
	static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxSamples>));
	static_assert(alignof(float) == alignof(SkV4));
	float* data = kernel[0].ptr();
	Compute2DBlurKernel(sigma, radii, SkSpan<float>(data, kMaxSamples));
	}

	void SkShaderBlurAlgorithm::Compute2DBlurOffsets(SkISize radius,
	std::array<SkV4, kMaxSamples/2>& offsets) {
	const int kernelArea = KernelWidth(radius.width()) * KernelWidth(radius.height());
	SkASSERT(kernelArea <= kMaxSamples);

	SkSpan<float> offsetView{offsets[0].ptr(), kMaxSamples*2};

	int i = 0;
	for (int y = -radius.height(); y <= radius.height(); ++y) {
	for (int x = -radius.width(); x <= radius.width(); ++x) {
	offsetView[2*i] = x;
	offsetView[2*i+1] = y;
	++i;
	}
	}
	SkASSERT(i == kernelArea);
	const int lastValidOffset = 2*(kernelArea - 1);
	for (; i < kMaxSamples; ++i) {
	offsetView[2*i] = offsetView[lastValidOffset];
	offsetView[2*i+1] = offsetView[lastValidOffset+1];
	}
	}

	void SkShaderBlurAlgorithm::Compute1DBlurLinearKernel(
	float sigma,
	int radius,
	std::array<SkV4, kMaxSamples/2>& offsetsAndKernel) {
	SkASSERT(sigma <= kMaxLinearSigma);
	SkASSERT(radius == SkBlurEngine::SigmaToRadius(sigma));
	SkASSERT(LinearKernelWidth(radius) <= kMaxSamples);

	// Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
	// The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
	// Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
	// Solving W' * x = Wj, W' * (1 - x) = Wi:
	// W' = Wi + Wj
	// x = Wj / (Wi + Wj)
	auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
	*new_w = wi + wj;
	*offset = wj / (wi + wj);
	};

	// Create a temporary standard kernel. The maximum blur radius that can be passed to this
	// function is (kMaxBlurSamples-1), so make an array large enough to hold the full kernel width.
	static constexpr int kMaxKernelWidth = KernelWidth(kMaxSamples - 1);
	SkASSERT(KernelWidth(radius) <= kMaxKernelWidth);
	std::array<float, kMaxKernelWidth> fullKernel;
	Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), KernelWidth(radius)});

	std::array<float, kMaxSamples> kernel;
	std::array<float, kMaxSamples> offsets;
	// Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output array.
	int halfSize = LinearKernelWidth(radius);
	int halfRadius = halfSize / 2;
	int lowIndex = halfRadius - 1;

	// Compute1DGaussianKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
	// compute only the upper half and mirror to the lower half.

	int index = radius;
	if (radius & 1) {
	// If N is odd, then use two samples.
	// The centre texel gets sampled twice, so halve its influence for each sample.
	// We essentially sample like this:
	// Texel edges
	// v v v v
	// \| \| \| \|
	// \-----^---/ Lower sample
	// \---^-----/ Upper sample
	get_new_weight(&kernel[halfRadius],
	&offsets[halfRadius],
	fullKernel[index] * 0.5f,
	fullKernel[index + 1]);
	kernel[lowIndex] = kernel[halfRadius];
	offsets[lowIndex] = -offsets[halfRadius];
	index++;
	lowIndex--;
	} else {
	// If N is even, then there are an even number of texels on either side of the centre texel.
	// Sample the centre texel directly.
	kernel[halfRadius] = fullKernel[index];
	offsets[halfRadius] = 0.0f;
	}
	index++;

	// Every other pair gets one sample.
	for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
	get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
	offsets[i] += static_cast<float>(index - radius);

	// Mirror to lower half.
	kernel[lowIndex] = kernel[i];
	offsets[lowIndex] = -offsets[i];
	}

	// Zero out remaining values in the kernel
	memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxSamples - halfSize));
	// But copy the last valid offset into the remaining offsets, to increase the chance that
	// over-iteration in a fragment shader will have a cache hit.
	for (int i = halfSize; i < kMaxSamples; ++i) {
	offsets[i] = offsets[halfSize - 1];
	}

	// Interleave into the output array to match the 1D SkSL effect
	for (int i = 0; i < kMaxSamples / 2; ++i) {
	offsetsAndKernel[i] = SkV4{offsets[2i], kernel[2i], offsets[2i+1], kernel[2i+1]};
	}
	}

	static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
	SkASSERT(kernelWidth >= 2 && kernelWidth <= SkShaderBlurAlgorithm::kMaxSamples);
	switch(kernelWidth) {
	// Batch on multiples of 4 (skipping width=1, since that can't happen)
	case 2: [[fallthrough]];
	case 3: [[fallthrough]];
	case 4: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey);
	case 5: [[fallthrough]];
	case 6: [[fallthrough]];
	case 7: [[fallthrough]];
	case 8: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+1);
	case 9: [[fallthrough]];
	case 10: [[fallthrough]];
	case 11: [[fallthrough]];
	case 12: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+2);
	case 13: [[fallthrough]];
	case 14: [[fallthrough]];
	case 15: [[fallthrough]];
	case 16: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+3);
	case 17: [[fallthrough]];
	case 18: [[fallthrough]];
	case 19: [[fallthrough]];
	// With larger kernels, batch on multiples of eight so up to 7 wasted samples.
	case 20: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+4);
	case 21: [[fallthrough]];
	case 22: [[fallthrough]];
	case 23: [[fallthrough]];
	case 24: [[fallthrough]];
	case 25: [[fallthrough]];
	case 26: [[fallthrough]];
	case 27: [[fallthrough]];
	case 28: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+5);
	default:
	SkUNREACHABLE;
	}
	}

	const SkRuntimeEffect* SkShaderBlurAlgorithm::GetLinearBlur1DEffect(int radius) {
	return GetKnownRuntimeEffect(
	to_stablekey(LinearKernelWidth(radius),
	static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase)));
	}

	const SkRuntimeEffect* SkShaderBlurAlgorithm::GetBlur2DEffect(const SkISize& radii) {
	int kernelArea = KernelWidth(radii.width()) * KernelWidth(radii.height());
	return GetKnownRuntimeEffect(
	to_stablekey(kernelArea,
	static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase)));
	}

	sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::renderBlur(sk_sp<SkShader> blurEffect,
	const SkIRect& dstRect,
	SkColorType colorType,
	sk_sp<SkColorSpace> colorSpace) const {
	SkImageInfo outII = SkImageInfo::Make({dstRect.width(), dstRect.height()},
	colorType, kPremul_SkAlphaType, std::move(colorSpace));
	sk_sp<SkDevice> device = this->makeDevice(outII);
	if (!device) {
	return nullptr;
	}

	// TODO(b/294102201): This is very much like AutoSurface in SkImageFilterTypes.cpp
	SkIRect subset = SkIRect::MakeSize(dstRect.size());
	device->clipRect(SkRect::Make(subset), SkClipOp::kIntersect, /aa=/false);
	device->setLocalToDevice(SkM44::Translate(-dstRect.left(), -dstRect.top()));
	SkPaint paint;
	paint.setBlendMode(SkBlendMode::kSrc);
	paint.setShader(std::move(blurEffect));
	device->drawPaint(paint);
	return device->snapSpecial(subset);
	}

	sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur2D(SkSize sigma,
	SkISize radii,
	sk_sp<SkSpecialImage> input,
	const SkIRect& srcRect,
	SkTileMode tileMode,
	const SkIRect& dstRect) const {
	std::array<SkV4, kMaxSamples/4> kernel;
	std::array<SkV4, kMaxSamples/2> offsets;
	Compute2DBlurKernel(sigma, radii, kernel);
	Compute2DBlurOffsets(radii, offsets);

	SkRuntimeShaderBuilder builder{sk_ref_sp(GetBlur2DEffect(radii))};
	builder.uniform("kernel") = kernel;
	builder.uniform("offsets") = offsets;
	// TODO(b/294102201): This is very much like FilterResult::asShader()...
	builder.child("child") =
	input->makeSubset(srcRect)->asShader(tileMode,
	SkFilterMode::kNearest,
	SkMatrix::Translate(srcRect.left(),srcRect.top()));

	return this->renderBlur(builder.makeShader(), dstRect,
	input->colorType(), input->colorInfo().refColorSpace());
	}

	sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur1D(float sigma,
	int radius,
	SkV2 dir,
	sk_sp<SkSpecialImage> input,
	SkIRect srcRect,
	SkTileMode tileMode,
	SkIRect dstRect) const {
	std::array<SkV4, kMaxSamples/2> offsetsAndKernel;
	Compute1DBlurLinearKernel(sigma, radius, offsetsAndKernel);

	SkRuntimeShaderBuilder builder{sk_ref_sp(GetLinearBlur1DEffect(radius))};
	builder.uniform("offsetsAndKernel") = offsetsAndKernel;
	builder.uniform("dir") = dir;
	// TODO(b/294102201): This is very much like FilterResult::asShader()...
	builder.child("child") =
	input->makeSubset(srcRect)->asShader(tileMode,
	SkFilterMode::kLinear,
	SkMatrix::Translate(srcRect.left(),srcRect.top()));

	return this->renderBlur(builder.makeShader(), dstRect,
	input->colorType(), input->colorInfo().refColorSpace());
	}

	sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::blur(SkSize sigma,
	sk_sp<SkSpecialImage> src,
	const SkIRect& srcRect,
	SkTileMode tileMode,
	const SkIRect& dstRect) const {
	SkASSERT(sigma.width() <= kMaxLinearSigma && sigma.height() <= kMaxLinearSigma);

	int radiusX = SkBlurEngine::SigmaToRadius(sigma.width());
	int radiusY = SkBlurEngine::SigmaToRadius(sigma.height());
	const int kernelArea = KernelWidth(radiusX) * KernelWidth(radiusY);
	if (kernelArea <= kMaxSamples && radiusX > 0 && radiusY > 0) {
	// Use a single-pass 2D kernel if it fits and isn't just 1D already
	return this->evalBlur2D(sigma,
	{radiusX, radiusY},
	std::move(src),
	srcRect,
	tileMode,
	dstRect);
	} else {
	// Use two passes of a 1D kernel (one per axis).
	SkIRect intermediateSrcRect = srcRect;
	SkIRect intermediateDstRect = dstRect;
	if (radiusX > 0) {
	if (radiusY > 0) {
	// Outset the output size of dstRect by the radius required for the next Y pass
	intermediateDstRect.outset(0, radiusY);
	if (!intermediateDstRect.intersect(srcRect.makeOutset(radiusX, radiusY))) {
	return nullptr;
	}
	}

	src = this->evalBlur1D(sigma.width(),
	radiusX,
	/dir=/{1.f, 0.f},
	std::move(src),
	srcRect,
	tileMode,
	intermediateDstRect);
	if (!src) {
	return nullptr;
	}
	intermediateSrcRect = SkIRect::MakeWH(src->width(), src->height());
	intermediateDstRect = dstRect.makeOffset(-intermediateDstRect.left(),
	-intermediateDstRect.top());
	}

	if (radiusY > 0) {
	src = this->evalBlur1D(sigma.height(),
	radiusY,
	/dir=/{0.f, 1.f},
	std::move(src),
	intermediateSrcRect,
	tileMode,
	intermediateDstRect);
	}

	return src;
	}
	}