// blob: 4464a92feed1597a3520812c53973ee418aaf1b3 [file] [log] [blame]
/*
* Copyright 2026 Google LLC
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "src/gpu/graphite/TextureFormatXferFn.h"
#include "include/core/SkColorType.h"
#include "src/base/SkAutoMalloc.h"
#include "src/base/SkFloatBits.h"
#include "src/base/SkHalf.h"
#include "src/base/SkMathPriv.h"
#include "src/base/SkVx.h"
#include "src/core/SkColorSpaceXformSteps.h"
#include "src/core/SkImageInfoPriv.h"
#include "src/core/SkRasterPipeline.h"
#include "src/core/SkRasterPipelineOpContexts.h"
#include "src/gpu/graphite/Log.h"
#include <functional>
namespace skgpu::graphite {
namespace {
using TF = TextureFormat;
// Bit flags describing the row-level bit manipulations implemented in this file. This is
// deliberately a plain (unscoped) enum, kept separate from FormatXferOps, because:
//   1. SkEnumBitmask cannot be used inside template parameters for the row bit manipulation
//      functions, whereas a plain uint8_t-backed value can.
//   2. It lets the FormatXferOps be split into direction-specific ops that do not need to be
//      exposed in the public API.
enum ExtendedFormatXferOp : uint8_t {
    kDropAlpha = 1 << 0,  // 4-channel data -> 3-channel data
    kPadAlpha  = 1 << 1,  // 3-channel data -> 4-channel data with an opaque alpha inserted
};
// Type-erased callable for one conversion step over a single row: it is handed a read-only `src`
// pointer, a writable `dst` pointer, and the row `width` (a pixel count in every use in this file).
using XferRowFn = std::function<void(const char* src, char* dst, int width)>;
// Builds a row function that walks a row `n` pixels at a time. Each step copies `n` source pixels
// (`srcBpp` bytes each) into a PxVec, runs `applyPixel` on it, and copies `n` destination pixels
// (`dstBpp` bytes each) back out. A final partial step handles rows whose width is not a multiple
// of `n`; it copies only the bytes of the remaining pixels in and out of the vector.
template <typename PxVec, typename PixelFn /* [](PxVec) -> PxVec */>
XferRowFn create_xfer_row_fn(int n, int srcBpp, int dstBpp, PixelFn applyPixel) {
    // PxVec must be large enough to hold N src pixels and N dst pixels...
    SkASSERT(sizeof(PxVec) >= (size_t)srcBpp*n);
    SkASSERT(sizeof(PxVec) >= (size_t)dstBpp*n);
    // ...and must be exactly the size of at least one of those two spans.
    SkASSERT(sizeof(PxVec) == (size_t)srcBpp*n || sizeof(PxVec) == (size_t)dstBpp*n);

    const int srcStep = n * srcBpp;
    const int dstStep = n * dstBpp;
    return [n, srcBpp, dstBpp, srcStep, dstStep, applyPixel](const char* src,
                                                             char* dst,
                                                             int width) {
        PxVec lanes{};

        // Full vectors of N pixels
        int remaining = width;
        for (; remaining >= n; remaining -= n) {
            memcpy(&lanes, src, srcStep);
            lanes = applyPixel(lanes);
            memcpy(dst, &lanes, dstStep);
            src += srcStep;
            dst += dstStep;
        }

        if (remaining <= 0) {
            return;
        }

        // Tail that is less than a full vector: only the leading `remaining` pixels are copied
        // in and out; the other lanes keep whatever the previous step left in them.
        SkASSERT(remaining < n);
        memcpy(&lanes, src, remaining * srcBpp);
        lanes = applyPixel(lanes);
        memcpy(dst, &lanes, remaining * dstBpp);
    };
}
// Applies the compile-time `Ops` to a vector holding N pixels, treating every channel as one Cx
// lane. N is the number of pixels processed per call; the supported ops only make sense for
// 3-channel data (which is widened to 4 lanes per pixel) or 4-channel data, so the vector always
// has 4*N lanes.
template <typename Cx, int N, uint8_t Ops, Cx OpaqueAlpha>
skvx::Vec<4*N, Cx> apply_ops_by_channel(skvx::Vec<4*N, Cx> pixel) {
    static_assert(N == 1 || N == 2 || N == 4);

    constexpr bool kPadsAlpha  = (Ops & kPadAlpha) != 0;
    constexpr bool kDropsAlpha = (Ops & kDropAlpha) != 0;

    if constexpr (kPadsAlpha) {
        // The 3-channel source pixels were loaded back to back into lanes [0, 3*N). Lane 3*N is
        // the first lane past that data, so it is overwritten with the opaque alpha value and
        // then replicated after every RGB triple by the shuffle.
        constexpr int kAlphaLane = 3*N;
        pixel[kAlphaLane] = OpaqueAlpha;
        if constexpr (N == 2) {
            pixel = skvx::shuffle<0,1,2,kAlphaLane, 3,4,5,kAlphaLane>(pixel);
        } else if constexpr (N == 4) {
            pixel = skvx::shuffle<0,1,2,kAlphaLane, 3,4,5,kAlphaLane,
                                  6,7,8,kAlphaLane, 9,10,11,kAlphaLane>(pixel);
        }
        // N == 1 needs no shuffle: the single pixel is already laid out as R,G,B,A.
    }

    // TODO(michaelludwig): Add other extend ops here

    if constexpr (kDropsAlpha) {
        // Compact the R,G,B lanes of each 4-channel source pixel into lanes [0, 3*N). The trailing
        // N lanes are filler; the caller's final memcpy only copies 3 channels per pixel.
        if constexpr (N == 2) {
            pixel = skvx::shuffle<0,1,2, 4,5,6, 6,6>(pixel);
        } else if constexpr (N == 4) {
            pixel = skvx::shuffle<0,1,2, 4,5,6, 8,9,10, 12,13,14, 14,14,14,14>(pixel);
        }
        // N == 1 needs no shuffle: R,G,B already occupy lanes 0..2.
    }

    return pixel;
}
// Returns the row function for a 3-channel format whose channels are each one Cx, selecting the
// drop-alpha or pad-alpha variant from `ops` (drop-alpha is checked first).
//
// NOTE: There are no parameters describing which channels SwapRB or DropAlpha touch because, for
// every format routed here, SwapRB exchanges channels 0 and 2 and dropping alpha removes
// channel 3. If that ever stops being true, these can become template parameters that are
// forwarded into the skvx::shuffle calls.
template <typename Cx, Cx OpaqueAlpha>
XferRowFn xfer_rows_by_channel(uint8_t ops) {
    static constexpr int kChannels = 3;
    static constexpr int kPaddedChannels = SkNextPow2(kChannels);
    // Fit to 128-bit/16-byte SIMD
    static constexpr int kPixelsPerVec = 16 / (kPaddedChannels * sizeof(Cx));
    static_assert(kPixelsPerVec == 1 || kPixelsPerVec == 2 || kPixelsPerVec == 4);

    using PxVec = skvx::Vec<kPixelsPerVec*kPaddedChannels, Cx>;

    const int packedBpp = kChannels * sizeof(Cx);        // the 3-channel texture format
    const int paddedBpp = kPaddedChannels * sizeof(Cx);  // the 4-channel CPU-side data

    if (ops & kDropAlpha) {
        // Going from 4-channel src to the 3-channel format
        return create_xfer_row_fn<PxVec>(
                kPixelsPerVec, /*srcBpp=*/paddedBpp, /*dstBpp=*/packedBpp,
                apply_ops_by_channel<Cx, kPixelsPerVec, kDropAlpha, OpaqueAlpha>);
    }
    if (ops & kPadAlpha) {
        // Going from the 3-channel format to 4-channel dst
        return create_xfer_row_fn<PxVec>(
                kPixelsPerVec, /*srcBpp=*/packedBpp, /*dstBpp=*/paddedBpp,
                apply_ops_by_channel<Cx, kPixelsPerVec, kPadAlpha, OpaqueAlpha>);
    }

    SKGPU_LOG_F("Identity transfer should have been caught earlier");
    return nullptr;
}
// Returns the row function that applies `ops` (a mask of ExtendedFormatXferOp bits) to rows of
// `format`. Only the 3-channel-per-pixel formats are implemented; every other case logs through
// SKGPU_LOG_F and yields an empty (null) XferRowFn.
XferRowFn get_xfer_row_fn(TextureFormat format, uint8_t ops) {
    static constexpr uint32_t kFloatBits1 = 0x3f800000; // SkFloat2Bits isn't constexpr
    SkASSERT(kFloatBits1 == SkFloat2Bits(1.f));
    SkASSERT(ops); // For now, assume we only call into this if we have work to do.

    switch (format) {
        case TF::kR8:
        case TF::kA8:
        case TF::kR16:
        case TF::kR16F:
        case TF::kRG8:
        case TF::kRG16:
        case TF::kRG16F:
        case TF::kRG32F:
            // 1 and 2 channel formats cannot be combined with colortypes in such a way to create
            // format conversions, so we should never reach here
            SKGPU_LOG_F("Unexpected ops (%u) requested for format %s",
                        ops, TextureFormatName(format));
            break;

        // Packed formats operate on a primitive that holds the entire pixel value
        case TF::kB5_G6_R5:
        case TF::kR5_G6_B5:
        case TF::kABGR4:
        case TF::kARGB4:
        case TF::kRGB10_A2:
        case TF::kBGR10_A2:
        case TF::kBGR10_XR:
            // TODO(michaelludwig): These formats could do r/b swaps and forcing-opaque, but
            // that isn't implemented yet.
            SKGPU_LOG_F("Unsupported texture format %s", TextureFormatName(format));
            break;

        // The remaining formats can be operated on with each channel as a primitive
        case TF::kRGB8_sRGB:
        case TF::kRGB8:
        case TF::kBGR8:
            return xfer_rows_by_channel<uint8_t, 0xFF>(ops);
        case TF::kRGB16:
            return xfer_rows_by_channel<uint16_t, 0xFFFF>(ops);
        case TF::kRGB16F:
            return xfer_rows_by_channel<uint16_t, SK_Half1>(ops);
        case TF::kRGB32F:
            return xfer_rows_by_channel<uint32_t, kFloatBits1>(ops);

        case TF::kRGBA8:
        case TF::kRGBA8_sRGB:
        case TF::kBGRA8:
        case TF::kBGRA8_sRGB:
        case TF::kRGBA16:
        case TF::kRGBA16F:
        case TF::kRGBA10x6:
        case TF::kBGRA10x6_XR:
        case TF::kRGBA32F:
            // TODO(michaelludwig): These formats could do r/b swaps and forcing-opaque, but
            // that isn't implemented yet.
            SKGPU_LOG_F("Unsupported texture format %s", TextureFormatName(format));
            break;

        default:
            // Remaining cases are compressed, multiplanar, or non-color so shouldn't be reached.
            // If the first assert trips, we missed a valid transfer format in the cases above.
            // If we hit the unreachable, we missed rejecting the transfer sooner.
            SkASSERT(TextureFormatColorTypeInfo(format).second & FormatXferOp::kDisabled);
            SkUNREACHABLE;
    }

    // Every `break` above lands here. NOTE(review): SKGPU_LOG_F is presumably fatal, but that is
    // not visible from this file; returning explicitly keeps control from flowing off the end of
    // a value-returning function and matches what xfer_rows_by_channel() does after its own
    // SKGPU_LOG_F.
    return nullptr;
}
} // anonymous namespace
// Builds the transfer function that converts CPU pixels of `srcCT` into the memory layout of
// `dstFormat`. Returns std::nullopt when the format reports FormatXferOp::kDisabled. In this
// direction the raster pipeline stage runs first and any format bit manipulation is a post-op.
std::optional<TextureFormatXferFn> TextureFormatXferFn::MakeCpuToGpu(
        SkColorType srcCT,
        const SkColorSpaceXformSteps& csSteps,
        TextureFormat dstFormat,
        Swizzle dstReadSwizzle) {
    auto [baseCT, xferOps] = TextureFormatColorTypeInfo(dstFormat);
    if (xferOps & FormatXferOp::kDisabled) {
        return std::nullopt;
    }

    // An R8 destination that is read back as RRRA stores gray. When the source is not already
    // gray, the pipeline's store type is switched to gray so that SkRasterPipeline computes
    // luminance from the source's color channels.
    const bool storeAsLuminance = dstReadSwizzle == Swizzle::RRRA() &&
                                  baseCT == kR8_unorm_SkColorType &&
                                  srcCT != kGray_8_SkColorType;

    const SkColorType pipelineDstCT = storeAsLuminance ? kGray_8_SkColorType : baseCT;
    Swizzle srcToDst = storeAsLuminance ? Swizzle::RGBA() : dstReadSwizzle.invert();
    if (xferOps & FormatXferOp::kSwapRB) {
        srcToDst = Swizzle::Concat(srcToDst, Swizzle::BGRA());
    }

    // On CPU->GPU conversion, FormatXferOp::kDropAlpha actually drops the alpha bits
    const uint8_t postOps = (xferOps & FormatXferOp::kDropAlpha)
                                    ? static_cast<uint8_t>(kDropAlpha)
                                    : uint8_t{0};

    auto pipelineOps = RPOps::Make(srcCT, pipelineDstCT, csSteps, srcToDst);
    return TextureFormatXferFn(dstFormat, /*preOps=*/0, std::move(pipelineOps), postOps);
}
// Builds the transfer function that converts texture data stored as `srcFormat` into CPU pixels
// of `dstCT`. Returns std::nullopt when the format reports FormatXferOp::kDisabled. In this
// direction any format bit manipulation is a pre-op that runs ahead of the raster pipeline stage.
std::optional<TextureFormatXferFn> TextureFormatXferFn::MakeGpuToCpu(
        TextureFormat srcFormat,
        Swizzle srcReadSwizzle,
        const SkColorSpaceXformSteps& csSteps,
        SkColorType dstCT) {
    auto [baseCT, xferOps] = TextureFormatColorTypeInfo(srcFormat);
    if (xferOps & FormatXferOp::kDisabled) {
        return std::nullopt;
    }

    // NOTE: baseCT does not need a red-vs-gray adjustment here. The read swizzle already pushes
    // the "red" value into all three channels, so any conversion to gray works back out to the
    // original value and the ambiguity is irrelevant.
    Swizzle srcToDst = (xferOps & FormatXferOp::kSwapRB)
                               ? Swizzle::Concat(srcReadSwizzle, Swizzle::BGRA())
                               : srcReadSwizzle;

    // On GPU->CPU conversion, FormatXferOp::kDropAlpha must pad alpha bits back
    const uint8_t preOps = (xferOps & FormatXferOp::kDropAlpha)
                                   ? static_cast<uint8_t>(kPadAlpha)
                                   : uint8_t{0};

    auto pipelineOps = RPOps::Make(baseCT, dstCT, srcToDst, csSteps);
    return TextureFormatXferFn(srcFormat, preOps, std::move(pipelineOps), /*postOps=*/0);
}
// Assembles the raster pipeline for one conversion: load as `srcColorType`, apply each modifier in
// argument order, store as `dstColorType`. Returns nullptr when the color types match and every
// modifier converts to false via SkToBool, i.e. there is nothing for a pipeline to do.
template <typename... RPModifiers>
std::unique_ptr<TextureFormatXferFn::RPOps> TextureFormatXferFn::RPOps::Make(
        SkColorType srcColorType,
        SkColorType dstColorType,
        RPModifiers... rpModifiers) {
    const bool sameColorType = srcColorType == dstColorType;
    if (sameColorType && (!SkToBool(rpModifiers) && ...)) {
        return nullptr; // Identity conversion
    }

    const auto srcBpp = SkColorTypeBytesPerPixel(srcColorType);
    const auto dstBpp = SkColorTypeBytesPerPixel(dstColorType);
    std::unique_ptr<RPOps> result(new RPOps(srcBpp, dstBpp));
    auto& pipeline = result->fRP;

    // NOTE: The src and dst memory contexts are left untouched here. They only provide stable
    // addresses for the appended ops to reference and are patched during run().
    pipeline.appendLoad(srcColorType, &result->fSrcCtx);

    // Each modifier is copied into the arena before apply() is called on the copy, because
    // apply() may hand pointers into itself to the pipeline as op contexts.
    (result->fArena.make<decltype(rpModifiers)>(rpModifiers)->apply(&pipeline), ...);

    pipeline.appendStore(dstColorType, &result->fDstCtx);
    return result;
}
// Configures the src/dst memory context strides for an upcoming run. Returns true when the raster
// pipeline can walk the whole 2D image by itself: both row-byte values are whole multiples of
// their pixel sizes and `otherOps` is zero. Otherwise both strides are zeroed and false is
// returned, telling the caller to drive the pipeline one row at a time.
bool TextureFormatXferFn::RPOps::setStrides(size_t srcRowBytes,
                                            size_t dstRowBytes,
                                            uint8_t otherOps) {
    // SkRasterPipeline expresses strides in pixels, so its built-in row stepping is only usable
    // when each row's byte length is a whole number of pixels.
    const bool pixelAligned = (srcRowBytes % fSrcBpp == 0) && (dstRowBytes % fDstBpp == 0);
    if (!pixelAligned || otherOps != 0) {
        // Control loop must proceed row by row, so stride can be 0
        fSrcCtx.stride = 0;
        fDstCtx.stride = 0;
        return false;
    }

    fSrcCtx.stride = SkTo<int>(srcRowBytes / fSrcBpp);
    fDstCtx.stride = SkTo<int>(dstRowBytes / fDstBpp);
    return true;
}
// TODO(michaelludwig): This is a WIP implementation, it is not focusing on performance yet.
void TextureFormatXferFn::run(int width, int height,
const void* src, size_t srcRowBytes,
void* dst, size_t dstRowBytes) const {
SkASSERT(width >= 1 && height >= 1);
int rpInvokeCount;
SkAutoMalloc tempRowStorage; // empty if no FormatXferOps have to be applied
if (fRP && fRP->setStrides(srcRowBytes, dstRowBytes, fPreOps | fPostOps)) {
// Conversions occur entirely within SkRasterPipeline, so we can configure the
// MemoryCtx's to process the whole 2D image.
rpInvokeCount = 1;
} else {
// Conversions will have to occur row-by-row. The SkRP row function will patch the
// memory contexts to each row's offset address so we can leave stride as 0.
SkASSERT(!fRP || (fRP->fSrcCtx.stride == 0 && fRP->fDstCtx.stride == 0));
rpInvokeCount = height;
height = 1;
}
skia_private::STArray<2, XferRowFn> rowFns; // At most 2 actions per row
if (fPreOps) {
// `src` is definitively the texture
if (fRP) {
// We need a temporary buffer equal to srcBpp*width to hold the output of the preOps
// that is used as the source of data for SkRasterPipeline (executed per row).
tempRowStorage.reset(fRP->fSrcBpp * width);
}
rowFns.push_back(get_xfer_row_fn(fFormat, fPreOps));
}
if (fRP) {
rowFns.push_back([&](const char* src, char* dst, int width) {
// NOTE: When height != 1, this invocation actually processes the entire image.
// Otherwise we assume src and dst have been offset by y so we update the MemoryCtx's
// pixel addresses.
fRP->fSrcCtx.pixels = const_cast<char*>(src); // This won't be written to
fRP->fDstCtx.pixels = dst;
fRP->fRP.run(0, 0, width, height);
});
}
if (fPostOps) {
// `dst` is definitively the texture
if (fRP) {
// We need a temporary buffer equal to dstBpp*width to hold the output of the
// SkRasterPipeline conversion that is used as the input to postOps (executed per row).
tempRowStorage.reset(fRP->fDstBpp * width);
}
rowFns.push_back(get_xfer_row_fn(fFormat, fPostOps));
}
if (rowFns.empty()) {
// Identity conversion function still needs to move the data
const int bpp = TextureFormatBytesPerBlock(fFormat);
rowFns.push_back([bpp](const char* src, char* dst, int width) {
memcpy(dst, src, bpp * width);
});
}
for (int y = 0; y < rpInvokeCount; ++y) {
// Always start by processing `src`
const char* input = static_cast<const char*>(src) + y * srcRowBytes;
for (int i = 0; i < rowFns.size(); ++i) {
// And either output to the temporary row or the final `dst`
char* target = i == rowFns.size() - 1 ? (static_cast<char*>(dst) + y * dstRowBytes)
: static_cast<char*>(tempRowStorage.get());
rowFns[i](input, target, width);
// If there's more than one rowFn, switch to using the temporary row as input
input = target;
}
}
}
} // namespace skgpu::graphite