blob: afbac5f0f0176924a1ceaeb54551752ec15ae7d7 [file] [log] [blame]
/*
* Copyright 2023 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "src/gpu/BlurUtils.h"
#include "include/core/SkBitmap.h"
#include "include/core/SkColorPriv.h"
#include "include/core/SkImageInfo.h"
#include "include/core/SkM44.h"
#include "include/core/SkScalar.h"
#include "include/core/SkSize.h"
#include "include/private/base/SkAssert.h"
#include "include/private/base/SkMath.h"
#include "include/private/base/SkTemplates.h"
#include "include/private/base/SkTo.h"
#include "src/base/SkMathPriv.h"
#include "src/core/SkKnownRuntimeEffects.h"
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>
namespace skgpu {
// Fills 'kernel' with a normalized 2D Gaussian evaluated over the (2rx+1) x (2ry+1) grid
// implied by 'radius', zeroing any unused tail of the span. 'radius' must already equal
// BlurSigmaRadius() of each sigma component.
void Compute2DBlurKernel(SkSize sigma,
                         SkISize radius,
                         SkSpan<float> kernel) {
    // Callers likely computed the radius before filling in the kernel, which is why it's
    // provided; but make sure it's consistent with expectations.
    SkASSERT(BlurSigmaRadius(sigma.width()) == radius.width() &&
             BlurSigmaRadius(sigma.height()) == radius.height());

    // Callers are responsible for downscaling large sigmas to values that can be processed by
    // the effects, so ensure the radius won't overflow 'kernel'.
    const int kernelW = BlurKernelWidth(radius.width());
    const int kernelH = BlurKernelWidth(radius.height());
    const size_t usedCount = SkTo<size_t>(sk_64_mul(kernelW, kernelH));
    SkASSERT(usedCount <= kernel.size());

    // The definition of an identity blur should be sufficient that 2sigma^2 isn't near zero
    // when there's a non-trivial radius.
    const float twoSigmaSqrdX = 2.0f * sigma.width() * sigma.width();
    const float twoSigmaSqrdY = 2.0f * sigma.height() * sigma.height();
    SkASSERT((radius.width() == 0 || !SkScalarNearlyZero(twoSigmaSqrdX)) &&
             (radius.height() == 0 || !SkScalarNearlyZero(twoSigmaSqrdY)));

    // Using a denominator of 1 when a radius is 0 automatically collapses the remaining math to
    // the 1D Gaussian distribution; when both radii are 0 it correctly yields a weight of 1.0.
    const float invTwoSigmaSqrdX = radius.width() > 0 ? 1.0f / twoSigmaSqrdX : 1.f;
    const float invTwoSigmaSqrdY = radius.height() > 0 ? 1.0f / twoSigmaSqrdY : 1.f;

    float total = 0.0f;
    for (int col = 0; col < kernelW; col++) {
        float dx = static_cast<float>(col - radius.width());
        const float xExponent = dx * dx * invTwoSigmaSqrdX;
        for (int row = 0; row < kernelH; row++) {
            float dy = static_cast<float>(row - radius.height());
            // The Gaussian's constant factor (1/sqrt(2*pi*sigma^2)) is omitted here since the
            // kernel is renormalized below anyway.
            float weight = std::exp(-(xExponent + dy * dy * invTwoSigmaSqrdY));
            kernel[row * kernelW + col] = weight;
            total += weight;
        }
    }

    // Normalize the kernel so the weights sum to 1.
    const float invTotal = 1.0f / total;
    for (size_t i = 0; i < usedCount; ++i) {
        kernel[i] *= invTotal;
    }
    // Zero the remainder of the provided span.
    memset(kernel.data() + usedCount, 0, sizeof(float)*(kernel.size() - usedCount));
}
// Convenience overload that writes the kernel into a vec4-packed uniform array, as expected by
// the 2D blur SkSL effect.
void Compute2DBlurKernel(SkSize sigma,
                         SkISize radii,
                         std::array<SkV4, kMaxBlurSamples/4>& kernel) {
    // The SkV4 array must be layout-compatible with a flat float array of kMaxBlurSamples.
    static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxBlurSamples>));
    static_assert(alignof(float) == alignof(SkV4));

    // Forward to the span-based overload, viewing the vec4 storage as raw floats.
    SkSpan<float> flatView{kernel[0].ptr(), kMaxBlurSamples};
    Compute2DBlurKernel(sigma, radii, flatView);
}
// Fills 'offsets' with the (x,y) texel offset of every sample in the 2D kernel implied by
// 'radius', packed two offsets per SkV4, padding the tail with the last valid offset.
void Compute2DBlurOffsets(SkISize radius, std::array<SkV4, kMaxBlurSamples/2>& offsets) {
    const int kernelArea = BlurKernelWidth(radius.width()) * BlurKernelWidth(radius.height());
    SkASSERT(kernelArea <= kMaxBlurSamples);

    // View the vec4 array as a flat list of (x,y) float pairs.
    SkSpan<float> pairs{offsets[0].ptr(), kMaxBlurSamples*2};

    // Emit one offset per kernel sample in row-major order.
    int sample = 0;
    for (int dy = -radius.height(); dy <= radius.height(); ++dy) {
        for (int dx = -radius.width(); dx <= radius.width(); ++dx) {
            pairs[2*sample]     = dx;
            pairs[2*sample + 1] = dy;
            ++sample;
        }
    }
    SkASSERT(sample == kernelArea);

    // Duplicate the final valid offset into the unused tail so that over-iteration in a
    // fragment shader is more likely to hit the texture cache.
    const int lastValidOffset = 2*(kernelArea - 1);
    for (; sample < kMaxBlurSamples; ++sample) {
        pairs[2*sample]     = pairs[lastValidOffset];
        pairs[2*sample + 1] = pairs[lastValidOffset + 1];
    }
}
// Converts a full 1D Gaussian kernel of width 2*radius+1 into a "linear" kernel of radius+1
// samples, where each sample relies on the GPU's bilinear filtering to blend two adjacent texel
// weights in a single fetch. Interleaved (offset, weight) pairs are written into
// 'offsetsAndKernel' to match the packing expected by the 1D blur SkSL effect.
void Compute1DBlurLinearKernel(float sigma,
                               int radius,
                               std::array<SkV4, kMaxBlurSamples/2>& offsetsAndKernel) {
    SkASSERT(sigma <= kMaxLinearBlurSigma);
    SkASSERT(radius == BlurSigmaRadius(sigma));
    SkASSERT(BlurLinearKernelWidth(radius) <= kMaxBlurSamples);

    // Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
    // The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
    // Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
    // Solving W' * x = Wj, W' * (1 - x) = Wi:
    //   W' = Wi + Wj
    //   x = Wj / (Wi + Wj)
    auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
        *new_w = wi + wj;
        *offset = wj / (wi + wj);
    };

    // Create a temporary standard kernel. The maximum blur radius that can be passed to this
    // function is (kMaxBlurSamples-1), so make an array large enough to hold the full kernel
    // width.
    static constexpr int kMaxKernelWidth = BlurKernelWidth(kMaxBlurSamples - 1);
    SkASSERT(BlurKernelWidth(radius) <= kMaxKernelWidth);
    std::array<float, kMaxKernelWidth> fullKernel;
    Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), BlurKernelWidth(radius)});

    std::array<float, kMaxBlurSamples> kernel;   // linear sample weights
    std::array<float, kMaxBlurSamples> offsets;  // linear sample offsets, in texels from center

    // Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output
    // array.
    int halfSize = skgpu::BlurLinearKernelWidth(radius);
    int halfRadius = halfSize / 2;
    int lowIndex = halfRadius - 1;  // mirror position in the lower half of the output

    // Compute1DGaussianKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
    // compute only the upper half and mirror to the lower half.
    int index = radius;  // start at the center tap of the full kernel
    if (radius & 1) {
        // If N is odd, then use two samples.
        // The centre texel gets sampled twice, so halve its influence for each sample.
        // We essentially sample like this:
        //   Texel edges
        //   v    v    v    v
        //   |    |    |    |
        //   \-----^---/       Lower sample
        //        \---^-----/  Upper sample
        get_new_weight(&kernel[halfRadius],
                       &offsets[halfRadius],
                       fullKernel[index] * 0.5f,
                       fullKernel[index + 1]);
        kernel[lowIndex] = kernel[halfRadius];
        offsets[lowIndex] = -offsets[halfRadius];
        index++;
        lowIndex--;
    } else {
        // If N is even, then there are an even number of texels on either side of the centre
        // texel. Sample the centre texel directly.
        kernel[halfRadius] = fullKernel[index];
        offsets[halfRadius] = 0.0f;
    }
    index++;

    // Every other pair gets one sample.
    for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
        get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
        // get_new_weight() produced an offset relative to the first texel of the pair; rebase it
        // to be relative to the kernel center.
        offsets[i] += static_cast<float>(index - radius);

        // Mirror to lower half.
        kernel[lowIndex] = kernel[i];
        offsets[lowIndex] = -offsets[i];
    }

    // Zero out remaining values in the kernel
    memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxBlurSamples - halfSize));
    // But copy the last valid offset into the remaining offsets, to increase the chance that
    // over-iteration in a fragment shader will have a cache hit.
    for (int i = halfSize; i < kMaxBlurSamples; ++i) {
        offsets[i] = offsets[halfSize - 1];
    }

    // Interleave into the output array to match the 1D SkSL effect
    for (int i = 0; i < skgpu::kMaxBlurSamples / 2; ++i) {
        offsetsAndKernel[i] = SkV4{offsets[2*i], kernel[2*i], offsets[2*i+1], kernel[2*i+1]};
    }
}
// Maps a kernel width onto one of the precompiled runtime-effect stable keys, batching widths
// so similar blurs share a compiled effect. Widths 2-16 are grouped in fours (width 1 cannot
// occur); larger widths are grouped in eights, wasting at most 7 samples per draw.
static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
    SkASSERT(kernelWidth >= 2 && kernelWidth <= kMaxBlurSamples);
    uint32_t keyOffset;
    if (kernelWidth <= 4) {
        keyOffset = 0;
    } else if (kernelWidth <= 8) {
        keyOffset = 1;
    } else if (kernelWidth <= 12) {
        keyOffset = 2;
    } else if (kernelWidth <= 16) {
        keyOffset = 3;
    } else if (kernelWidth <= 20) {
        keyOffset = 4;
    } else if (kernelWidth <= 28) {
        keyOffset = 5;
    } else {
        SkUNREACHABLE;
    }
    return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey + keyOffset);
}
// Returns the precompiled 1D linear-blur effect whose batched sample count covers 'radius'.
const SkRuntimeEffect* GetLinearBlur1DEffect(int radius) {
    const int sampleCount = BlurLinearKernelWidth(radius);
    const auto base = static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase);
    return GetKnownRuntimeEffect(to_stablekey(sampleCount, base));
}
// Returns the precompiled 2D blur effect whose batched sample count covers the kernel area
// implied by 'radii'.
const SkRuntimeEffect* GetBlur2DEffect(const SkISize& radii) {
    const int kernelArea = BlurKernelWidth(radii.width()) * BlurKernelWidth(radii.height());
    const auto base = static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase);
    return GetKnownRuntimeEffect(to_stablekey(kernelArea, base));
}
///////////////////////////////////////////////////////////////////////////////
// Rect Blur
///////////////////////////////////////////////////////////////////////////////
// TODO: it seems like there should be some synergy with SkBlurMask::ComputeBlurProfile
// TODO: maybe cache this on the cpu side?
// Builds a width x 1 A8 texture containing the integral of a Gaussian over a six-sigma range,
// running from 255 (fully covered) down to 0. Returns an empty bitmap on failure.
SkBitmap CreateIntegralTable(float sixSigma) {
    SkBitmap table;
    const int width = ComputeIntegralTableWidth(sixSigma);
    if (width == 0 || !table.tryAllocPixels(SkImageInfo::MakeA8(width, 1))) {
        return table;  // empty bitmap signals failure to the caller
    }

    // Pin the end points of the profile exactly.
    *table.getAddr8(0, 0) = 255;
    *table.getAddr8(width - 1, 0) = 0;

    const float invWidth = 1.f / width;
    for (int i = 1; i < width - 1; ++i) {
        // Map the texel center into the erf() argument for the +/-3 sigma range.
        float x = (i + 0.5f) * invWidth;
        x = (-6 * x + 3) * SK_ScalarRoot2Over2;
        const float integral = 0.5f * (std::erf(x) + 1.f);
        *table.getAddr8(i, 0) = SkToU8(sk_float_round2int(255.f * integral));
    }

    table.setImmutable();
    return table;
}
// Returns the texel width to use for an integral table covering 'sixSigma', or 0 when the input
// is NaN or too large to process.
int ComputeIntegralTableWidth(float sixSigma) {
    // NaN can't be binned meaningfully; report failure.
    if (std::isnan(sixSigma)) {
        return 0;
    }
    // Avoid overflow, covers both multiplying by 2 and finding next power of 2:
    // 2*((2^31-1)/4 + 1) = 2*(2^29-1) + 2 = 2^30 and SkNextPow2(2^30) = 2^30
    if (sixSigma > SK_MaxS32 / 4 + 1) {
        return 0;
    }
    // The texture represents the integral of a normal distribution over a six-sigma range
    // centered at zero. Use 2 texels per dst pixel so the linear interpolation done in texture
    // lookup doesn't introduce noticeable artifacts.
    const int minWidth = 2 * static_cast<int>(std::ceil(sixSigma));
    // Bin by powers of 2 (with a floor of 32) so we get good profile reuse.
    return std::max(SkNextPow2(minWidth), 32);
}
///////////////////////////////////////////////////////////////////////////////
// Circle Blur
///////////////////////////////////////////////////////////////////////////////
// Computes an unnormalized half kernel (right side). Returns the summation of all the half
// kernel values.
// Fills 'halfKernel' with unnormalized Gaussian weights for the right half of the kernel,
// evaluated at half-pixel steps out from the center. Returns the sum of all written values.
static float make_unnormalized_half_kernel(float* halfKernel, int halfKernelSize, float sigma) {
    const float invSigma = 1.0f / sigma;
    const float negHalfInvSigmaSqrd = -0.5f * invSigma * invSigma;
    float total = 0.0f;
    // Sample the Gaussian at t = 0.5, 1.5, 2.5, ...
    float t = 0.5f;
    for (int i = 0; i < halfKernelSize; ++i, t += 1.0f) {
        const float value = expf(t * t * negHalfInvSigmaSqrd);
        halfKernel[i] = value;
        total += value;
    }
    return total;
}
// Create a Gaussian half-kernel (right side) and a summed area table given a sigma and number
// of discrete steps. The half kernel is normalized to sum to 0.5.
// Builds a normalized Gaussian half kernel (right side) summing to 0.5, plus its prefix-sum
// table so that summedHalfKernel[i] is the integral of the half kernel through entry i.
static void make_half_kernel_and_summed_table(float* halfKernel,
                                              float* summedHalfKernel,
                                              int halfKernelSize,
                                              float sigma) {
    // Normalize so the half kernel sums to 0.5 (the full kernel would sum to 1.0).
    const float normalization =
            2.0f * make_unnormalized_half_kernel(halfKernel, halfKernelSize, sigma);
    float runningSum = 0.0f;
    for (int i = 0; i < halfKernelSize; ++i) {
        halfKernel[i] /= normalization;
        runningSum += halfKernel[i];
        summedHalfKernel[i] = runningSum;
    }
}
// Applies the 1D half kernel vertically at points along the x axis to a circle centered at the
// origin with radius circleR.
// For 'numSteps' columns starting at x = firstX (stepping by 1), integrates the half kernel
// vertically against a circle of radius circleR centered at the origin, writing one value per
// column into 'results'.
static void apply_kernel_in_y(float* results,
                              int numSteps,
                              float firstX,
                              float circleR,
                              int halfKernelSize,
                              const float* summedHalfKernelTable) {
    float x = firstX;
    for (int i = 0; i < numSteps; ++i, x += 1.0f) {
        float columnEval;
        if (x < -circleR || x > circleR) {
            // Column lies entirely outside the circle.
            columnEval = 0;
        } else {
            // The column at x exits the circle at +y and -y. Summed table entry j actually
            // reflects an offset of j + 0.5, so shift y by half a texel.
            float y = sqrtf(circleR * circleR - x * x) - 0.5f;
            const int yInt = SkScalarFloorToInt(y);
            SkASSERT(yInt >= -1);
            if (y < 0) {
                // Fractional coverage of just the first table entry.
                columnEval = (y + 0.5f) * summedHalfKernelTable[0];
            } else if (yInt >= halfKernelSize - 1) {
                // The column spans the entire half kernel; its integral is 0.5.
                columnEval = 0.5f;
            } else {
                // Linearly interpolate between adjacent summed-table entries.
                const float yFrac = y - yInt;
                columnEval = (1.0f - yFrac) * summedHalfKernelTable[yInt] +
                             yFrac * summedHalfKernelTable[yInt + 1];
            }
        }
        results[i] = columnEval;
    }
}
// Apply a Gaussian at point (evalX, 0) to a circle centered at the origin with radius circleR.
// This relies on having a half kernel computed for the Gaussian and a table of applications of
// the half kernel in y to columns at (evalX - halfKernel, evalX - halfKernel + 1, ..., evalX +
// halfKernel) passed in as yKernelEvaluations.
// Convolves the half kernel horizontally with the precomputed vertical evaluations around
// evalX, producing the blurred coverage at (evalX, 0) as an 8-bit alpha value.
static uint8_t eval_at(float evalX,
                       float circleR,
                       const float* halfKernel,
                       int halfKernelSize,
                       const float* yKernelEvaluations) {
    float acc = 0;

    // Walk left-to-right over the 2*halfKernelSize columns centered on evalX, skipping columns
    // outside the circle. Columns left of center use the mirrored (reversed) half kernel;
    // columns right of center use it directly.
    float x = evalX - halfKernelSize;
    for (int i = 0; i < 2 * halfKernelSize; ++i, x += 1.0f) {
        if (x < -circleR || x > circleR) {
            continue;
        }
        const float weight = (i < halfKernelSize) ? halfKernel[halfKernelSize - i - 1]
                                                  : halfKernel[i - halfKernelSize];
        acc += yKernelEvaluations[i] * weight;
    }

    // The kernel was only applied over +y; double the result since the circle is symmetric
    // about the x axis.
    return SkUnitScalarClampToByte(2.0f * acc);
}
// This function creates a profile of a blurred circle. It does this by computing a kernel for
// half the Gaussian and a matching summed area table. The summed area table is used to compute
// an array of vertical applications of the half kernel to the circle along the x axis. The
// table of y evaluations has 2 * k + n entries where k is the size of the half kernel and n is
// the size of the profile being computed. Then for each of the n profile entries we walk out k
// steps in each horizontal direction multiplying the corresponding y evaluation by the half
// kernel entry and sum these values to compute the profile entry.
// Builds a profileWidth x 1 A8 profile of a circle of the given radius blurred by 'sigma'. The
// half kernel is applied vertically at columns spanning the profile, then each profile entry is
// the horizontal convolution of the half kernel with those column evaluations. Returns an empty
// bitmap if allocation fails.
SkBitmap CreateCircleProfile(float sigma, float radius, int profileWidth) {
    SkBitmap bitmap;
    if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
        return bitmap;  // empty bitmap signals failure
    }
    uint8_t* profile = bitmap.getAddr8(0, 0);

    const int numSteps = profileWidth;

    // The full kernel is 6 sigmas wide; round up to an even width and halve it.
    int halfKernelSize = SkScalarCeilToInt(6.0f * sigma);
    halfKernelSize = ((halfKernelSize + 1) & ~1) >> 1;

    // Applying the kernel in y must cover every profile sample plus halfKernelSize columns of
    // padding on either side.
    const int numYSteps = numSteps + 2 * halfKernelSize;

    // A single allocation holds the half kernel, its summed table, and the y evaluations.
    skia_private::AutoTArray<float> storage(halfKernelSize + halfKernelSize + numYSteps);
    float* halfKernel = storage.get();
    float* summedKernel = halfKernel + halfKernelSize;
    float* yEvals = summedKernel + halfKernelSize;

    make_half_kernel_and_summed_table(halfKernel, summedKernel, halfKernelSize, sigma);

    const float firstX = -halfKernelSize + 0.5f;
    apply_kernel_in_y(yEvals, numYSteps, firstX, radius, halfKernelSize, summedKernel);

    for (int i = 0; i < numSteps - 1; ++i) {
        profile[i] = eval_at(i + 0.5f, radius, halfKernel, halfKernelSize, yEvals + i);
    }
    // Force the tail of the Gaussian to zero.
    profile[numSteps - 1] = 0;

    return bitmap;
}
// Builds a profileWidth x 1 A8 profile of a blurred half plane, where sigma is derived from the
// width (full kernel = 6 sigmas = profileWidth). The profile is filled right-to-left with the
// running integral of the Gaussian, ending at 0 on the right edge. Returns an empty bitmap if
// allocation fails.
SkBitmap CreateHalfPlaneProfile(int profileWidth) {
    SkASSERT(!(profileWidth & 0x1));  // the split into two equal halves below requires an even width

    SkBitmap bitmap;
    if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
        return bitmap;  // empty bitmap signals failure
    }
    uint8_t* profile = bitmap.getAddr8(0, 0);

    // The full kernel is 6 sigmas wide.
    const float sigma = profileWidth / 6.0f;
    const int halfKernelSize = profileWidth / 2;

    skia_private::AutoTArray<float> halfKernel(halfKernelSize);

    // The half kernel should sum to 0.5.
    const float tot = 2.0f * make_unnormalized_half_kernel(halfKernel.get(), halfKernelSize, sigma);
    float sum = 0.0f;
    // Populate the profile from the right edge to the middle, normalizing each kernel entry
    // in place while accumulating the running integral.
    for (int i = 0; i < halfKernelSize; ++i) {
        halfKernel[halfKernelSize - i - 1] /= tot;
        sum += halfKernel[halfKernelSize - i - 1];
        profile[profileWidth - i - 1] = SkUnitScalarClampToByte(sum);
    }
    // Populate the profile from the middle to the left edge (by flipping the half kernel and
    // continuing the summation).
    for (int i = 0; i < halfKernelSize; ++i) {
        sum += halfKernel[i];
        profile[halfKernelSize - i - 1] = SkUnitScalarClampToByte(sum);
    }
    // Ensure the tail of the Gaussian goes to zero.
    profile[profileWidth - 1] = 0;
    return bitmap;
}
} // namespace skgpu