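metal: SDL_RenderFillRects now issues one indexed draw call per batch of rectangles (up to UINT16_MAX / 6, about 10.9k rects per draw within a given FillRects call, using a shared quad index buffer sized for 16k quads) instead of one draw call per rectangle. This reduces CPU usage when drawing many rectangles.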
--HG--
extra : amend_source : 1b39afd4eaa31c151f607e4685f079a46fa6238c
diff --git a/src/render/metal/SDL_render_metal.m b/src/render/metal/SDL_render_metal.m
index 06e4ef6..1e9a2e8 100644
--- a/src/render/metal/SDL_render_metal.m
+++ b/src/render/metal/SDL_render_metal.m
@@ -117,6 +117,7 @@
@property (nonatomic, retain) id<MTLSamplerState> mtlsamplernearest;
@property (nonatomic, retain) id<MTLSamplerState> mtlsamplerlinear;
@property (nonatomic, retain) id<MTLBuffer> mtlbufconstants;
+ @property (nonatomic, retain) id<MTLBuffer> mtlbufquadindices;
@property (nonatomic, retain) CAMetalLayer *mtllayer;
@property (nonatomic, retain) MTLRenderPassDescriptor *mtlpassdesc;
@property (nonatomic, assign) METAL_ShaderPipelines *activepipelines;
@@ -137,6 +138,7 @@
[_mtlsamplernearest release];
[_mtlsamplerlinear release];
[_mtlbufconstants release];
+ [_mtlbufquadindices release];
[_mtllayer release];
[_mtlpassdesc release];
[super dealloc];
@@ -794,7 +796,6 @@
static int
METAL_QueueFillRects(SDL_Renderer * renderer, SDL_RenderCommand *cmd, const SDL_FRect * rects, int count)
{
- // !!! FIXME: use an index buffer
const size_t vertlen = (sizeof (float) * 8) * count;
float *verts = (float *) SDL_AllocateRenderVertices(renderer, vertlen, 0, &cmd->data.draw.first);
if (!verts) {
@@ -803,6 +804,11 @@
cmd->data.draw.count = count;
+ /* Quads in the following vertex order (matches the quad index buffer):
+ * 1---3
+ * | \ |
+ * 0---2
+ */
for (int i = 0; i < count; i++, rects++) {
if ((rects->w <= 0.0f) || (rects->h <= 0.0f)) {
cmd->data.draw.count--;
@@ -829,9 +835,8 @@
METAL_QueueCopy(SDL_Renderer * renderer, SDL_RenderCommand *cmd, SDL_Texture * texture,
const SDL_Rect * srcrect, const SDL_FRect * dstrect)
{
- METAL_TextureData *texturedata = (__bridge METAL_TextureData *)texture->driverdata;
- const float texw = (float) texturedata.mtltexture.width;
- const float texh = (float) texturedata.mtltexture.height;
+ const float texw = (float) texture->w;
+ const float texh = (float) texture->h;
// !!! FIXME: use an index buffer
const size_t vertlen = (sizeof (float) * 16);
float *verts = (float *) SDL_AllocateRenderVertices(renderer, vertlen, 0, &cmd->data.draw.first);
@@ -867,9 +872,8 @@
const SDL_Rect * srcquad, const SDL_FRect * dstrect,
const double angle, const SDL_FPoint *center, const SDL_RendererFlip flip)
{
- METAL_TextureData *texturedata = (__bridge METAL_TextureData *)texture->driverdata;
- const float texw = (float) texturedata.mtltexture.width;
- const float texh = (float) texturedata.mtltexture.height;
+ const float texw = (float) texture->w;
+ const float texh = (float) texture->h;
const float rads = (float)(M_PI * (float) angle / 180.0f);
const float c = cosf(rads), s = sinf(rads);
float minu, maxu, minv, maxv;
@@ -1159,10 +1163,19 @@
case SDL_RENDERCMD_FILL_RECTS: {
const size_t count = cmd->data.draw.count;
- size_t start = 0;
+ const size_t maxcount = UINT16_MAX / 6;
SetDrawState(renderer, cmd, SDL_METAL_FRAGMENT_SOLID, CONSTANTS_OFFSET_IDENTITY, mtlbufvertex, &statecache);
- for (size_t i = 0; i < count; i++, start += 4) { // !!! FIXME: can we do all of these this with a single draw call, using MTLPrimitiveTypeTriangle and an index buffer?
- [data.mtlcmdencoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:start vertexCount:4];
+ /* Our index buffer has 16 bit indices, so one draw can address at most 65k
+ * vertices (16k rects); each call here draws up to maxcount (UINT16_MAX / 6) rects. */
+ for (size_t i = 0; i < count; i += maxcount) {
+ /* Set the vertex buffer offset for our current positions.
+ * The vertex buffer itself was bound in SetDrawState. */
+ [data.mtlcmdencoder setVertexBufferOffset:cmd->data.draw.first + i*sizeof(float)*8 atIndex:0];
+ [data.mtlcmdencoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
+ indexCount:SDL_min(maxcount, count - i) * 6
+ indexType:MTLIndexTypeUInt16
+ indexBuffer:data.mtlbufquadindices
+ indexBufferOffset:0];
}
break;
}
@@ -1424,11 +1437,6 @@
#if !__has_feature(objc_arc)
[mtlbufconstantstaging autorelease];
#endif
- mtlbufconstantstaging.label = @"SDL constant staging data";
-
- id<MTLBuffer> mtlbufconstants = [data.mtldevice newBufferWithLength:CONSTANTS_LENGTH options:MTLResourceStorageModePrivate];
- data.mtlbufconstants = mtlbufconstants;
- data.mtlbufconstants.label = @"SDL constant data";
char *constantdata = [mtlbufconstantstaging contents];
SDL_memcpy(constantdata + CONSTANTS_OFFSET_IDENTITY, identitytransform, sizeof(identitytransform));
@@ -1437,10 +1445,42 @@
SDL_memcpy(constantdata + CONSTANTS_OFFSET_DECODE_BT601, decodetransformBT601, sizeof(decodetransformBT601));
SDL_memcpy(constantdata + CONSTANTS_OFFSET_DECODE_BT709, decodetransformBT709, sizeof(decodetransformBT709));
+ int quadcount = UINT16_MAX / 4;
+ size_t indicessize = sizeof(UInt16) * quadcount * 6;
+ id<MTLBuffer> mtlbufquadindicesstaging = [data.mtldevice newBufferWithLength:indicessize options:MTLResourceStorageModeShared];
+#if !__has_feature(objc_arc)
+ [mtlbufquadindicesstaging autorelease];
+#endif
+
+ /* Quads in the following vertex order (matches the FillRects vertices):
+ * 1---3
+ * | \ |
+ * 0---2
+ */
+ UInt16 *indexdata = [mtlbufquadindicesstaging contents];
+ for (int i = 0; i < quadcount; i++) {
+ indexdata[i * 6 + 0] = i * 4 + 0;
+ indexdata[i * 6 + 1] = i * 4 + 1;
+ indexdata[i * 6 + 2] = i * 4 + 2;
+
+ indexdata[i * 6 + 3] = i * 4 + 2;
+ indexdata[i * 6 + 4] = i * 4 + 1;
+ indexdata[i * 6 + 5] = i * 4 + 3;
+ }
+
+ id<MTLBuffer> mtlbufconstants = [data.mtldevice newBufferWithLength:CONSTANTS_LENGTH options:MTLResourceStorageModePrivate];
+ data.mtlbufconstants = mtlbufconstants;
+ data.mtlbufconstants.label = @"SDL constant data";
+
+ id<MTLBuffer> mtlbufquadindices = [data.mtldevice newBufferWithLength:indicessize options:MTLResourceStorageModePrivate];
+ data.mtlbufquadindices = mtlbufquadindices;
+ data.mtlbufquadindices.label = @"SDL quad index buffer";
+
id<MTLCommandBuffer> cmdbuffer = [data.mtlcmdqueue commandBuffer];
id<MTLBlitCommandEncoder> blitcmd = [cmdbuffer blitCommandEncoder];
- [blitcmd copyFromBuffer:mtlbufconstantstaging sourceOffset:0 toBuffer:data.mtlbufconstants destinationOffset:0 size:CONSTANTS_LENGTH];
+ [blitcmd copyFromBuffer:mtlbufconstantstaging sourceOffset:0 toBuffer:mtlbufconstants destinationOffset:0 size:CONSTANTS_LENGTH];
+ [blitcmd copyFromBuffer:mtlbufquadindicesstaging sourceOffset:0 toBuffer:mtlbufquadindices destinationOffset:0 size:indicessize];
[blitcmd endEncoding];
[cmdbuffer commit];
@@ -1503,8 +1543,10 @@
#endif
#else
#ifdef __IPHONE_11_0
- if ([mtldevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily4_v1]) {
- maxtexsize = 16384;
+ if (@available(iOS 11.0, *)) {
+ if ([mtldevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily4_v1]) {
+ maxtexsize = 16384;
+ }
} else
#endif
#ifdef __IPHONE_10_0
@@ -1529,6 +1571,7 @@
[mtlsamplernearest release];
[mtlsamplerlinear release];
[mtlbufconstants release];
+ [mtlbufquadindices release];
[view release];
[data release];
[mtldevice release];