blob: afbac5f0f0176924a1ceaeb54551752ec15ae7d7 [file] [log] [blame]
/*
* Copyright 2023 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "src/gpu/BlurUtils.h"
#include "include/core/SkBitmap.h"
#include "include/core/SkColorPriv.h"
#include "include/core/SkImageInfo.h"
#include "include/core/SkM44.h"
#include "include/core/SkScalar.h"
#include "include/core/SkSize.h"
#include "include/private/base/SkAssert.h"
#include "include/private/base/SkMath.h"
#include "include/private/base/SkTemplates.h"
#include "include/private/base/SkTo.h"
#include "src/base/SkMathPriv.h"
#include "src/core/SkKnownRuntimeEffects.h"
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
namespace skgpu {
// Fills 'kernel' with a normalized 2D Gaussian evaluated over the (2rx+1) x (2ry+1) grid
// implied by 'radius', zeroing any unused tail of the span. 'radius' must already equal
// BlurSigmaRadius() of each sigma component.
void Compute2DBlurKernel(SkSize sigma,
                         SkISize radius,
                         SkSpan<float> kernel) {
    // Callers likely computed the radius before filling in the kernel, which is why it's
    // provided; but make sure it's consistent with expectations.
    SkASSERT(BlurSigmaRadius(sigma.width()) == radius.width() &&
             BlurSigmaRadius(sigma.height()) == radius.height());

    // Callers are responsible for downscaling large sigmas to values that can be processed by
    // the effects, so ensure the radius won't overflow 'kernel'.
    const int kernelW = BlurKernelWidth(radius.width());
    const int kernelH = BlurKernelWidth(radius.height());
    const size_t usedCount = SkTo<size_t>(sk_64_mul(kernelW, kernelH));
    SkASSERT(usedCount <= kernel.size());

    // The definition of an identity blur should be sufficient that 2sigma^2 isn't near zero
    // when there's a non-trivial radius.
    const float twoSigmaSqrdX = 2.0f * sigma.width() * sigma.width();
    const float twoSigmaSqrdY = 2.0f * sigma.height() * sigma.height();
    SkASSERT((radius.width() == 0 || !SkScalarNearlyZero(twoSigmaSqrdX)) &&
             (radius.height() == 0 || !SkScalarNearlyZero(twoSigmaSqrdY)));

    // Using a denominator of 1 when a radius is 0 automatically collapses the remaining math to
    // the 1D Gaussian distribution; when both radii are 0 it correctly yields a weight of 1.0.
    const float invTwoSigmaSqrdX = radius.width() > 0 ? 1.0f / twoSigmaSqrdX : 1.f;
    const float invTwoSigmaSqrdY = radius.height() > 0 ? 1.0f / twoSigmaSqrdY : 1.f;

    float total = 0.0f;
    for (int col = 0; col < kernelW; col++) {
        float dx = static_cast<float>(col - radius.width());
        const float xExponent = dx * dx * invTwoSigmaSqrdX;
        for (int row = 0; row < kernelH; row++) {
            float dy = static_cast<float>(row - radius.height());
            // The Gaussian's constant factor (1/sqrt(2*pi*sigma^2)) is omitted here since the
            // kernel is renormalized below anyway.
            float weight = std::exp(-(xExponent + dy * dy * invTwoSigmaSqrdY));
            kernel[row * kernelW + col] = weight;
            total += weight;
        }
    }

    // Normalize the kernel so the weights sum to 1.
    const float invTotal = 1.0f / total;
    for (size_t i = 0; i < usedCount; ++i) {
        kernel[i] *= invTotal;
    }
    // Zero the remainder of the provided span.
    memset(kernel.data() + usedCount, 0, sizeof(float)*(kernel.size() - usedCount));
}
// Convenience overload that writes the kernel into a vec4-packed uniform array, as expected by
// the 2D blur SkSL effect.
void Compute2DBlurKernel(SkSize sigma,
                         SkISize radii,
                         std::array<SkV4, kMaxBlurSamples/4>& kernel) {
    // The SkV4 array must be layout-compatible with a flat float array of kMaxBlurSamples.
    static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxBlurSamples>));
    static_assert(alignof(float) == alignof(SkV4));

    // Forward to the span-based overload, viewing the vec4 storage as raw floats.
    SkSpan<float> flatView{kernel[0].ptr(), kMaxBlurSamples};
    Compute2DBlurKernel(sigma, radii, flatView);
}
// Fills 'offsets' with the (x,y) texel offset of every sample in the 2D kernel implied by
// 'radius', packed two offsets per SkV4, padding the tail with the last valid offset.
void Compute2DBlurOffsets(SkISize radius, std::array<SkV4, kMaxBlurSamples/2>& offsets) {
    const int kernelArea = BlurKernelWidth(radius.width()) * BlurKernelWidth(radius.height());
    SkASSERT(kernelArea <= kMaxBlurSamples);

    // View the vec4 array as a flat list of (x,y) float pairs.
    SkSpan<float> pairs{offsets[0].ptr(), kMaxBlurSamples*2};

    // Emit one offset per kernel sample in row-major order.
    int sample = 0;
    for (int dy = -radius.height(); dy <= radius.height(); ++dy) {
        for (int dx = -radius.width(); dx <= radius.width(); ++dx) {
            pairs[2*sample]     = dx;
            pairs[2*sample + 1] = dy;
            ++sample;
        }
    }
    SkASSERT(sample == kernelArea);

    // Duplicate the final valid offset into the unused tail so that over-iteration in a
    // fragment shader is more likely to hit the texture cache.
    const int lastValidOffset = 2*(kernelArea - 1);
    for (; sample < kMaxBlurSamples; ++sample) {
        pairs[2*sample]     = pairs[lastValidOffset];
        pairs[2*sample + 1] = pairs[lastValidOffset + 1];
    }
}
// Converts a full 1D Gaussian kernel of width 2*radius+1 into a "linear" kernel of radius+1
// samples, where each sample relies on the GPU's bilinear filtering to blend two adjacent texel
// weights in a single fetch. Interleaved (offset, weight) pairs are written into
// 'offsetsAndKernel' to match the packing expected by the 1D blur SkSL effect.
void Compute1DBlurLinearKernel(float sigma,
                               int radius,
                               std::array<SkV4, kMaxBlurSamples/2>& offsetsAndKernel) {
    SkASSERT(sigma <= kMaxLinearBlurSigma);
    SkASSERT(radius == BlurSigmaRadius(sigma));
    SkASSERT(BlurLinearKernelWidth(radius) <= kMaxBlurSamples);

    // Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
    // The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
    // Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
    // Solving W' * x = Wj, W' * (1 - x) = Wi:
    //   W' = Wi + Wj
    //   x = Wj / (Wi + Wj)
    auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
        *new_w = wi + wj;
        *offset = wj / (wi + wj);
    };

    // Create a temporary standard kernel. The maximum blur radius that can be passed to this
    // function is (kMaxBlurSamples-1), so make an array large enough to hold the full kernel
    // width.
    static constexpr int kMaxKernelWidth = BlurKernelWidth(kMaxBlurSamples - 1);
    SkASSERT(BlurKernelWidth(radius) <= kMaxKernelWidth);
    std::array<float, kMaxKernelWidth> fullKernel;
    Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), BlurKernelWidth(radius)});

    std::array<float, kMaxBlurSamples> kernel;   // linear sample weights
    std::array<float, kMaxBlurSamples> offsets;  // linear sample offsets, in texels from center

    // Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output
    // array.
    int halfSize = skgpu::BlurLinearKernelWidth(radius);
    int halfRadius = halfSize / 2;
    int lowIndex = halfRadius - 1;  // mirror position in the lower half of the output

    // Compute1DGaussianKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
    // compute only the upper half and mirror to the lower half.
    int index = radius;  // start at the center tap of the full kernel
    if (radius & 1) {
        // If N is odd, then use two samples.
        // The centre texel gets sampled twice, so halve its influence for each sample.
        // We essentially sample like this:
        //   Texel edges
        //   v    v    v    v
        //   |    |    |    |
        //   \-----^---/       Lower sample
        //        \---^-----/  Upper sample
        get_new_weight(&kernel[halfRadius],
                       &offsets[halfRadius],
                       fullKernel[index] * 0.5f,
                       fullKernel[index + 1]);
        kernel[lowIndex] = kernel[halfRadius];
        offsets[lowIndex] = -offsets[halfRadius];
        index++;
        lowIndex--;
    } else {
        // If N is even, then there are an even number of texels on either side of the centre
        // texel. Sample the centre texel directly.
        kernel[halfRadius] = fullKernel[index];
        offsets[halfRadius] = 0.0f;
    }
    index++;

    // Every other pair gets one sample.
    for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
        get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
        // get_new_weight() produced an offset relative to the first texel of the pair; rebase it
        // to be relative to the kernel center.
        offsets[i] += static_cast<float>(index - radius);

        // Mirror to lower half.
        kernel[lowIndex] = kernel[i];
        offsets[lowIndex] = -offsets[i];
    }

    // Zero out remaining values in the kernel
    memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxBlurSamples - halfSize));
    // But copy the last valid offset into the remaining offsets, to increase the chance that
    // over-iteration in a fragment shader will have a cache hit.
    for (int i = halfSize; i < kMaxBlurSamples; ++i) {
        offsets[i] = offsets[halfSize - 1];
    }

    // Interleave into the output array to match the 1D SkSL effect
    for (int i = 0; i < skgpu::kMaxBlurSamples / 2; ++i) {
        offsetsAndKernel[i] = SkV4{offsets[2*i], kernel[2*i], offsets[2*i+1], kernel[2*i+1]};
    }
}
// Maps a kernel width onto one of the precompiled runtime-effect stable keys, batching widths
// so similar blurs share a compiled effect. Widths 2-16 are grouped in fours (width 1 cannot
// occur); larger widths are grouped in eights, wasting at most 7 samples per draw.
static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
    SkASSERT(kernelWidth >= 2 && kernelWidth <= kMaxBlurSamples);
    uint32_t keyOffset;
    if (kernelWidth <= 4) {
        keyOffset = 0;
    } else if (kernelWidth <= 8) {
        keyOffset = 1;
    } else if (kernelWidth <= 12) {
        keyOffset = 2;
    } else if (kernelWidth <= 16) {
        keyOffset = 3;
    } else if (kernelWidth <= 20) {
        keyOffset = 4;
    } else if (kernelWidth <= 28) {
        keyOffset = 5;
    } else {
        SkUNREACHABLE;
    }
    return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey + keyOffset);
}
// Returns the precompiled 1D linear-blur effect whose batched sample count covers 'radius'.
const SkRuntimeEffect* GetLinearBlur1DEffect(int radius) {
    const int sampleCount = BlurLinearKernelWidth(radius);
    const auto base = static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase);
    return GetKnownRuntimeEffect(to_stablekey(sampleCount, base));
}
// Returns the precompiled 2D blur effect whose batched sample count covers the kernel area
// implied by 'radii'.
const SkRuntimeEffect* GetBlur2DEffect(const SkISize& radii) {
    const int kernelArea = BlurKernelWidth(radii.width()) * BlurKernelWidth(radii.height());
    const auto base = static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase);
    return GetKnownRuntimeEffect(to_stablekey(kernelArea, base));
}
///////////////////////////////////////////////////////////////////////////////
// Rect Blur
///////////////////////////////////////////////////////////////////////////////
// TODO: it seems like there should be some synergy with SkBlurMask::ComputeBlurProfile
// TODO: maybe cache this on the cpu side?
// Builds a width x 1 A8 texture containing the integral of a Gaussian over a six-sigma range,
// running from 255 (fully covered) down to 0. Returns an empty bitmap on failure.
SkBitmap CreateIntegralTable(float sixSigma) {
    SkBitmap table;
    const int width = ComputeIntegralTableWidth(sixSigma);
    if (width == 0 || !table.tryAllocPixels(SkImageInfo::MakeA8(width, 1))) {
        return table;  // empty bitmap signals failure to the caller
    }

    // Pin the end points of the profile exactly.
    *table.getAddr8(0, 0) = 255;
    *table.getAddr8(width - 1, 0) = 0;

    const float invWidth = 1.f / width;
    for (int i = 1; i < width - 1; ++i) {
        // Map the texel center into the erf() argument for the +/-3 sigma range.
        float x = (i + 0.5f) * invWidth;
        x = (-6 * x + 3) * SK_ScalarRoot2Over2;
        const float integral = 0.5f * (std::erf(x) + 1.f);
        *table.getAddr8(i, 0) = SkToU8(sk_float_round2int(255.f * integral));
    }

    table.setImmutable();
    return table;
}
// Returns the texel width to use for an integral table covering 'sixSigma', or 0 when the input
// is NaN or too large to process.
int ComputeIntegralTableWidth(float sixSigma) {
    // NaN can't be binned meaningfully; report failure.
    if (std::isnan(sixSigma)) {
        return 0;
    }
    // Avoid overflow, covers both multiplying by 2 and finding next power of 2:
    // 2*((2^31-1)/4 + 1) = 2*(2^29-1) + 2 = 2^30 and SkNextPow2(2^30) = 2^30
    if (sixSigma > SK_MaxS32 / 4 + 1) {
        return 0;
    }
    // The texture represents the integral of a normal distribution over a six-sigma range
    // centered at zero. Use 2 texels per dst pixel so the linear interpolation done in texture
    // lookup doesn't introduce noticeable artifacts.
    const int minWidth = 2 * static_cast<int>(std::ceil(sixSigma));
    // Bin by powers of 2 (with a floor of 32) so we get good profile reuse.
    return std::max(SkNextPow2(minWidth), 32);
}
///////////////////////////////////////////////////////////////////////////////
// Circle Blur
///////////////////////////////////////////////////////////////////////////////
// Computes an unnormalized half kernel (right side). Returns the summation of all the half
// kernel values.
// Fills 'halfKernel' with unnormalized Gaussian weights for the right half of the kernel,
// evaluated at half-pixel steps out from the center. Returns the sum of all written values.
static float make_unnormalized_half_kernel(float* halfKernel, int halfKernelSize, float sigma) {
    const float invSigma = 1.0f / sigma;
    const float negHalfInvSigmaSqrd = -0.5f * invSigma * invSigma;
    float total = 0.0f;
    // Sample the Gaussian at t = 0.5, 1.5, 2.5, ...
    float t = 0.5f;
    for (int i = 0; i < halfKernelSize; ++i, t += 1.0f) {
        const float value = expf(t * t * negHalfInvSigmaSqrd);
        halfKernel[i] = value;
        total += value;
    }
    return total;
}
// Create a Gaussian half-kernel (right side) and a summed area table given a sigma and number
// of discrete steps. The half kernel is normalized to sum to 0.5.
// Builds a normalized Gaussian half kernel (right side) summing to 0.5, plus its prefix-sum
// table so that summedHalfKernel[i] is the integral of the half kernel through entry i.
static void make_half_kernel_and_summed_table(float* halfKernel,
                                              float* summedHalfKernel,
                                              int halfKernelSize,
                                              float sigma) {
    // Normalize so the half kernel sums to 0.5 (the full kernel would sum to 1.0).
    const float normalization =
            2.0f * make_unnormalized_half_kernel(halfKernel, halfKernelSize, sigma);
    float runningSum = 0.0f;
    for (int i = 0; i < halfKernelSize; ++i) {
        halfKernel[i] /= normalization;
        runningSum += halfKernel[i];
        summedHalfKernel[i] = runningSum;
    }
}
// Applies the 1D half kernel vertically at points along the x axis to a circle centered at the
// origin with radius circleR.
// For 'numSteps' columns starting at x = firstX (stepping by 1), integrates the half kernel
// vertically against a circle of radius circleR centered at the origin, writing one value per
// column into 'results'.
static void apply_kernel_in_y(float* results,
                              int numSteps,
                              float firstX,
                              float circleR,
                              int halfKernelSize,
                              const float* summedHalfKernelTable) {
    float x = firstX;
    for (int i = 0; i < numSteps; ++i, x += 1.0f) {
        float columnEval;
        if (x < -circleR || x > circleR) {
            // Column lies entirely outside the circle.
            columnEval = 0;
        } else {
            // The column at x exits the circle at +y and -y. Summed table entry j actually
            // reflects an offset of j + 0.5, so shift y by half a texel.
            float y = sqrtf(circleR * circleR - x * x) - 0.5f;
            const int yInt = SkScalarFloorToInt(y);
            SkASSERT(yInt >= -1);
            if (y < 0) {
                // Fractional coverage of just the first table entry.
                columnEval = (y + 0.5f) * summedHalfKernelTable[0];
            } else if (yInt >= halfKernelSize - 1) {
                // The column spans the entire half kernel; its integral is 0.5.
                columnEval = 0.5f;
            } else {
                // Linearly interpolate between adjacent summed-table entries.
                const float yFrac = y - yInt;
                columnEval = (1.0f - yFrac) * summedHalfKernelTable[yInt] +
                             yFrac * summedHalfKernelTable[yInt + 1];
            }
        }
        results[i] = columnEval;
    }
}
// Apply a Gaussian at point (evalX, 0) to a circle centered at the origin with radius circleR.
// This relies on having a half kernel computed for the Gaussian and a table of applications of
// the half kernel in y to columns at (evalX - halfKernel, evalX - halfKernel + 1, ..., evalX +
// halfKernel) passed in as yKernelEvaluations.
// Convolves the half kernel horizontally with the precomputed vertical evaluations around
// evalX, producing the blurred coverage at (evalX, 0) as an 8-bit alpha value.
static uint8_t eval_at(float evalX,
                       float circleR,
                       const float* halfKernel,
                       int halfKernelSize,
                       const float* yKernelEvaluations) {
    float acc = 0;

    // Walk left-to-right over the 2*halfKernelSize columns centered on evalX, skipping columns
    // outside the circle. Columns left of center use the mirrored (reversed) half kernel;
    // columns right of center use it directly.
    float x = evalX - halfKernelSize;
    for (int i = 0; i < 2 * halfKernelSize; ++i, x += 1.0f) {
        if (x < -circleR || x > circleR) {
            continue;
        }
        const float weight = (i < halfKernelSize) ? halfKernel[halfKernelSize - i - 1]
                                                  : halfKernel[i - halfKernelSize];
        acc += yKernelEvaluations[i] * weight;
    }

    // The kernel was only applied over +y; double the result since the circle is symmetric
    // about the x axis.
    return SkUnitScalarClampToByte(2.0f * acc);
}
// This function creates a profile of a blurred circle. It does this by computing a kernel for
// half the Gaussian and a matching summed area table. The summed area table is used to compute
// an array of vertical applications of the half kernel to the circle along the x axis. The
// table of y evaluations has 2 * k + n entries where k is the size of the half kernel and n is
// the size of the profile being computed. Then for each of the n profile entries we walk out k
// steps in each horizontal direction multiplying the corresponding y evaluation by the half
// kernel entry and sum these values to compute the profile entry.
// Builds a profileWidth x 1 A8 profile of a circle of the given radius blurred by 'sigma'. The
// half kernel is applied vertically at columns spanning the profile, then each profile entry is
// the horizontal convolution of the half kernel with those column evaluations. Returns an empty
// bitmap if allocation fails.
SkBitmap CreateCircleProfile(float sigma, float radius, int profileWidth) {
    SkBitmap bitmap;
    if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
        return bitmap;  // empty bitmap signals failure
    }
    uint8_t* profile = bitmap.getAddr8(0, 0);

    const int numSteps = profileWidth;

    // The full kernel is 6 sigmas wide; round up to an even width and halve it.
    int halfKernelSize = SkScalarCeilToInt(6.0f * sigma);
    halfKernelSize = ((halfKernelSize + 1) & ~1) >> 1;

    // Applying the kernel in y must cover every profile sample plus halfKernelSize columns of
    // padding on either side.
    const int numYSteps = numSteps + 2 * halfKernelSize;

    // A single allocation holds the half kernel, its summed table, and the y evaluations.
    skia_private::AutoTArray<float> storage(halfKernelSize + halfKernelSize + numYSteps);
    float* halfKernel = storage.get();
    float* summedKernel = halfKernel + halfKernelSize;
    float* yEvals = summedKernel + halfKernelSize;

    make_half_kernel_and_summed_table(halfKernel, summedKernel, halfKernelSize, sigma);

    const float firstX = -halfKernelSize + 0.5f;
    apply_kernel_in_y(yEvals, numYSteps, firstX, radius, halfKernelSize, summedKernel);

    for (int i = 0; i < numSteps - 1; ++i) {
        profile[i] = eval_at(i + 0.5f, radius, halfKernel, halfKernelSize, yEvals + i);
    }
    // Force the tail of the Gaussian to zero.
    profile[numSteps - 1] = 0;

    return bitmap;
}
// Builds a profileWidth x 1 A8 profile of a blurred half plane, where sigma is derived from the
// width (full kernel = 6 sigmas = profileWidth). The profile is filled right-to-left with the
// running integral of the Gaussian, ending at 0 on the right edge. Returns an empty bitmap if
// allocation fails.
SkBitmap CreateHalfPlaneProfile(int profileWidth) {
    SkASSERT(!(profileWidth & 0x1));  // the split into two equal halves below requires an even width

    SkBitmap bitmap;
    if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
        return bitmap;  // empty bitmap signals failure
    }
    uint8_t* profile = bitmap.getAddr8(0, 0);

    // The full kernel is 6 sigmas wide.
    const float sigma = profileWidth / 6.0f;
    const int halfKernelSize = profileWidth / 2;

    skia_private::AutoTArray<float> halfKernel(halfKernelSize);

    // The half kernel should sum to 0.5.
    const float tot = 2.0f * make_unnormalized_half_kernel(halfKernel.get(), halfKernelSize, sigma);
    float sum = 0.0f;
    // Populate the profile from the right edge to the middle, normalizing each kernel entry
    // in place while accumulating the running integral.
    for (int i = 0; i < halfKernelSize; ++i) {
        halfKernel[halfKernelSize - i - 1] /= tot;
        sum += halfKernel[halfKernelSize - i - 1];
        profile[profileWidth - i - 1] = SkUnitScalarClampToByte(sum);
    }
    // Populate the profile from the middle to the left edge (by flipping the half kernel and
    // continuing the summation).
    for (int i = 0; i < halfKernelSize; ++i) {
        sum += halfKernel[i];
        profile[halfKernelSize - i - 1] = SkUnitScalarClampToByte(sum);
    }
    // Ensure the tail of the Gaussian goes to zero.
    profile[profileWidth - 1] = 0;
    return bitmap;
}
} // namespace skgpu