src/core/SkThreadedBMPDevice.cpp - skia - Git at Google

 /*
  * Copyright 2017 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #include "SkThreadedBMPDevice.h"

 #include "SkPath.h"
 #include "SkTaskGroup.h"
 #include "SkVertices.h"

 #include <mutex>
 #include <vector>

 // Alignment, in bytes, applied to the per-tile TileScheduleData structs in this file so that
 // each one starts on its own cache line. 64 is an assumed cache-line size, not a queried one.
 constexpr int MAX_CACHE_LINE = 64;

 // Basic logic and data shared by the current experimental schedulers.
 //
 // signal() is called once per queued draw and finish() once no further draws will be signaled;
 // the derived classes read fDrawCnt and fIsFinishing from next().
 class TiledDrawSchedulerBase : public TiledDrawScheduler {
 public:
     // tiles - number of tiles the schedulers distribute work over.
     // work  - callback invoked as work(tileIndex, drawIndex) to run one draw on one tile.
     TiledDrawSchedulerBase(int tiles, WorkFunc work)
             : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(std::move(work)) {}

     // Announces that one more draw is available.
     void signal() override {
         fDrawCnt++;
     }

     // Announces that no more draws will be signaled.
     //
     // The store uses release ordering so that every earlier signal() is published together with
     // the flag: a reader that observes fIsFinishing == true through an acquire load is then
     // guaranteed to read the final fDrawCnt. A relaxed store gives no such guarantee.
     void finish() override {
         fIsFinishing.store(true, std::memory_order_release);
     }

 protected:
     const int                   fTileCnt;      // number of tiles
     std::atomic<bool>           fIsFinishing;  // set by finish()
     std::atomic<int>            fDrawCnt;      // number of draws signaled so far
     WorkFunc                    fWork;         // runs draw #drawIndex on tile #tileIndex
 };

 class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase {
 public:
     TiledDrawSchedulerBySpinning(int tiles, WorkFunc work)
             : TiledDrawSchedulerBase(tiles, std::move(work)), fScheduleData(tiles) {}

     void signal() final { this->TiledDrawSchedulerBase::signal(); }
     void finish() final { this->TiledDrawSchedulerBase::finish(); }

     bool next(int& tileIndex) final {
         int& drawIndex = fScheduleData[tileIndex].fDrawIndex;
         SkASSERT(drawIndex <= fDrawCnt);
         while (true) {
             bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
             if (isFinishing && drawIndex >= fDrawCnt) {
                 return false;
             } else if (drawIndex < fDrawCnt) {
                 fWork(tileIndex, drawIndex++);
                 return true;
             }
         }
     }

 private:
     // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
     struct alignas(MAX_CACHE_LINE) TileScheduleData {
         TileScheduleData() : fDrawIndex(0) {}

         int fDrawIndex; // next draw index for this tile
     };

     std::vector<TileScheduleData>  fScheduleData;
 };

 class TiledDrawSchedulerFlexible : public TiledDrawSchedulerBase {
 public:
     TiledDrawSchedulerFlexible(int tiles, WorkFunc work)
             : TiledDrawSchedulerBase(tiles, std::move(work)), fScheduleData(tiles) {}

     void signal() final { this->TiledDrawSchedulerBase::signal(); }
     void finish() final { this->TiledDrawSchedulerBase::finish(); }

     bool next(int& tileIndex) final {
         int failCnt = 0;
         while (true) {
             TileScheduleData& scheduleData = fScheduleData[tileIndex];
             bool locked = scheduleData.fMutex.try_lock();
             bool processed = false;

             if (locked) {
                 if (scheduleData.fDrawIndex < fDrawCnt) {
                     fWork(tileIndex, scheduleData.fDrawIndex++);
                     processed = true;
                 } else {
                     failCnt += fIsFinishing.load(std::memory_order_relaxed);
                 }
                 scheduleData.fMutex.unlock();
             }

             if (processed) {
                 return true;
             } else {
                 if (failCnt >= fTileCnt) {
                     return false;
                 }
                 tileIndex = (tileIndex + 1) % fTileCnt;
             }
         }
     }

 private:
     // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
     struct alignas(MAX_CACHE_LINE) TileScheduleData {
         TileScheduleData() : fDrawIndex(0) {}

         int         fDrawIndex; // next draw index for this tile
         std::mutex  fMutex;     // the mutex for the thread to acquire
     };

     std::vector<TileScheduleData>  fScheduleData;
 };

 class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase {
 public:
     TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work)
             : TiledDrawSchedulerBase(tiles, std::move(work)), fScheduleData(tiles) {}


     void signal() final {
         this->TiledDrawSchedulerBase::signal();
         signalRoot();
     }

     void finish() final {
         this->TiledDrawSchedulerBase::finish();
         signalRoot();
     }

     bool next(int& tileIndex) final {
         SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt);
         TileScheduleData& scheduleData = fScheduleData[tileIndex];
         while (true) {
             scheduleData.fSemaphore.wait();
             int leftChild = (tileIndex + 1) * 2 - 1;
             int rightChild = leftChild + 1;
             if (leftChild < fTileCnt) {
                 fScheduleData[leftChild].fSemaphore.signal();
             }
             if (rightChild < fTileCnt) {
                 fScheduleData[rightChild].fSemaphore.signal();
             }

             bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
             if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) {
                 return false;
             } else {
                 SkASSERT(scheduleData.fDrawIndex < fDrawCnt);
                 fWork(tileIndex, scheduleData.fDrawIndex++);
                 return true;
             }
         }
     }

 private:
     // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
     struct alignas(MAX_CACHE_LINE) TileScheduleData {
         TileScheduleData() : fDrawIndex(0) {}

         int         fDrawIndex;
         SkSemaphore fSemaphore;
     };

     void signalRoot() {
         SkASSERT(fTileCnt > 0);
         fScheduleData[0].fSemaphore.signal();
     }

     std::vector<TileScheduleData> fScheduleData;
 };

 // Creates a fresh scheduler and launches fThreadCnt tasks on the task group; each task keeps
 // calling the scheduler's next() until it returns false. Expects an empty queue.
 void SkThreadedBMPDevice::startThreads() {
     SkASSERT(fQueueSize == 0);

     // using Scheduler = TiledDrawSchedulerBySemaphores;
     // using Scheduler = TiledDrawSchedulerBySpinning;
     using Scheduler = TiledDrawSchedulerFlexible;

     // Runs queued draw #draw on tile #tile, skipping draws whose bounds miss the tile.
     TiledDrawScheduler::WorkFunc runDrawOnTile = [this](int tile, int draw) {
         auto& queued = fQueue[draw];
         if (!SkIRect::Intersects(fTileBounds[tile], queued.fDrawBounds)) {
             return;
         }
         queued.fDrawFn(fTileBounds[tile]);
     };
     fScheduler.reset(new Scheduler(fTileCnt, runDrawOnTile));

     // We intentionally call the int parameter tileIndex although it ranges from 0 to fThreadCnt-1.
     // For some schedulers (e.g., TiledDrawSchedulerBySemaphores and TiledDrawSchedulerBySpinning),
     // fThreadCnt should be equal to fTileCnt so it doesn't make a difference.
     //
     // For TiledDrawSchedulerFlexible, the input tileIndex provides only a hint about which tile
     // the current thread should draw; the scheduler may later modify that tileIndex to draw on
     // another tile.
     fTaskGroup->batch(fThreadCnt, [this](int tileIndex) {
         for (;;) {
             if (!fScheduler->next(tileIndex)) {
                 break;
             }
         }
     });
 }

 // Ends the current batch: tells the scheduler that no more draws are coming, waits on the task
 // group, then empties the queue and drops the scheduler.
 void SkThreadedBMPDevice::finishThreads() {
     fScheduler->finish();
     fTaskGroup->wait();
     // The queue is reset only after the wait: the tasks started by startThreads() read fQueue.
     fQueueSize = 0;
     fScheduler.reset(nullptr);
 }

 // bitmap   - target bitmap, forwarded to the base device.
 // tiles    - number of horizontal stripes the bitmap is split into.
 // threads  - number of tasks to launch; a value <= 0 means "same as tiles".
 // executor - executor to run on; when null, an internal FIFO thread pool is created.
 SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap,
                                          int tiles,
                                          int threads,
                                          SkExecutor* executor)
         : INHERITED(bitmap)
         , fTileCnt(tiles)
         , fThreadCnt(threads <= 0 ? tiles : threads)
 {
     if (executor != nullptr) {
         fExecutor = executor;
     } else {
         fInternalExecutor = SkExecutor::MakeFIFOThreadPool(fThreadCnt);
         fExecutor = fInternalExecutor.get();
     }

     // Tiling using stripes for now; we'll explore better tiling in the future.
     // Stripe height is the bitmap height divided by the tile count, rounded up.
     const int stripeW = bitmap.width();
     const int stripeH = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1);
     for (int tid = 0; tid < fTileCnt; ++tid) {
         const int top = tid * stripeH;
         fTileBounds.push_back(SkIRect::MakeLTRB(0, top, stripeW, top + stripeH));
     }

     fQueueSize = 0;
     fTaskGroup.reset(new SkTaskGroup(*fExecutor));
     this->startThreads();
 }

 // Finishes the current batch of queued draws, then starts a new scheduler and a new batch of
 // tasks so that further draws can be queued.
 void SkThreadedBMPDevice::flush() {
     finishThreads();
     startThreads();
 }

 // Snapshot of the device state (destination pixmap, matrix, raster clip) taken when a draw is
 // queued. Having this captured in lambda seems to be faster than saving this in DrawElement.
 struct SkThreadedBMPDevice::DrawState {
     SkPixmap fDst;
     SkMatrix fMatrix;
     SkRasterClip fRC;

     explicit DrawState(SkThreadedBMPDevice* dev) {
         // we need fDst to be set, and if we're actually drawing, to dirty the genID
         const bool hasPixels = dev->accessPixels(&fDst);
         if (!hasPixels) {
             // NoDrawDevice uses us (why?) so we have to catch this case w/ no pixels
             fDst.reset(dev->imageInfo(), nullptr, 0);
         }
         fMatrix = dev->ctm();
         fRC = dev->fRCStack.rc();
     }

     // Builds an SkDraw for one tile.
     //   threadRC     - out: set to this state's clip intersected with threadBounds. The returned
     //                  SkDraw points at it, so it must outlive the returned value.
     //   threadBounds - bounds of the tile being drawn.
     // The returned SkDraw also points at this object's fMatrix.
     SkDraw getThreadDraw(SkRasterClip& threadRC, const SkIRect& threadBounds) const {
         threadRC = fRC;
         threadRC.op(threadBounds, SkRegion::kIntersect_Op);

         SkDraw tileDraw;
         tileDraw.fDst = fDst;
         tileDraw.fMatrix = &fMatrix;
         tileDraw.fRC = &threadRC;
         return tileDraw;
     }
 };

 // Maps drawBounds through the current matrix and rounds the result out to integers. The
 // "largest" rect is passed through as the largest SkIRect without being mapped.
 SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const {
     if (!drawBounds.isLargest()) {
         SkRect mapped;
         this->ctm().mapRect(&mapped, drawBounds);
         return mapped.roundOut();
     }
     return SkIRect::MakeLargest();
 }

 // Queues one draw and signals the scheduler.
 //   drawBounds     - SkRect bounds of the draw; passed through transformDrawBounds() and stored
 //                    with the queue entry.
 //   actualDrawCall - an SkDraw member call such as drawRect(r, paint); it is expanded as
 //                    "draw.actualDrawCall" inside the queued closure.
 // If the queue already holds MAX_QUEUE_SIZE entries, it is flushed first. The closure captures
 // by copy ([=]) the DrawState snapshot and every local it names; for a pointer, only the pointer
 // itself is copied, not the data it points to.
 // The do {...} while (false) is to enforce trailing semicolon as suggested by mtklein@
 #define THREADED_DRAW(drawBounds, actualDrawCall)                                                  \
     do {                                                                                           \
         if (fQueueSize == MAX_QUEUE_SIZE) {                                                        \
             this->flush();                                                                         \
         }                                                                                          \
         DrawState ds(this);                                                                        \
         SkASSERT(fQueueSize < MAX_QUEUE_SIZE);                                                     \
         fQueue[fQueueSize++] = {                                                                   \
             this->transformDrawBounds(drawBounds),                                                 \
             [=](const SkIRect& tileBounds) {                                                       \
                 SkRasterClip tileRC;                                                               \
                 SkDraw draw = ds.getThreadDraw(tileRC, tileBounds);                                \
                 draw.actualDrawCall;                                                               \
             },                                                                                     \
         };                                                                                         \
         fScheduler->signal();                                                                      \
     } while (false)

 // Returns the paint's fast bounds for rect r, or the "largest" rect when the paint cannot
 // compute fast bounds.
 static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) {
     if (!p.canComputeFastBounds()) {
         return SkRect::MakeLargest();
     }
     SkRect storage;
     storage = p.computeFastBounds(r, &storage);
     return storage;
 }

 void SkThreadedBMPDevice::drawPaint(const SkPaint& paint) {
     THREADED_DRAW(SkRect::MakeLargest(), drawPaint(paint));
 }

 // Queues a drawPoints of `count` points from `pts`.
 void SkThreadedBMPDevice::drawPoints(SkCanvas::PointMode mode, size_t count,
         const SkPoint pts[], const SkPaint& paint) {
     // The queued closure is invoked later (through fDrawFn), after this call has returned, so
     // capturing the caller's `pts` pointer could leave it dangling. Capture a private copy of
     // the points instead. (Not const, so the closure holding it can still be moved.)
     std::vector<SkPoint> ptsCopy(pts, pts + count);
     // TODO tighter drawBounds
     SkRect drawBounds = SkRect::MakeLargest();
     THREADED_DRAW(drawBounds, drawPoints(mode, count, ptsCopy.data(), paint, nullptr));
 }

 void SkThreadedBMPDevice::drawRect(const SkRect& r, const SkPaint& paint) {
     SkRect drawBounds = get_fast_bounds(r, paint);
     THREADED_DRAW(drawBounds, drawRect(r, paint));
 }

 void SkThreadedBMPDevice::drawRRect(const SkRRect& rrect, const SkPaint& paint) {
 #ifdef SK_IGNORE_BLURRED_RRECT_OPT
     SkPath  path;

     path.addRRect(rrect);
     // call the VIRTUAL version, so any subclasses who do handle drawPath aren't
     // required to override drawRRect.
     this->drawPath(path, paint, nullptr, false);
 #else
     SkRect drawBounds = get_fast_bounds(rrect.getBounds(), paint);
     THREADED_DRAW(drawBounds, drawRRect(rrect, paint));
 #endif
 }

 void SkThreadedBMPDevice::drawPath(const SkPath& path, const SkPaint& paint,
         const SkMatrix* prePathMatrix, bool pathIsMutable) {
     SkRect drawBounds = path.isInverseFillType() ? SkRect::MakeLargest()
                                                  : get_fast_bounds(path.getBounds(), paint);
     // For thread safety, make path imutable
     THREADED_DRAW(drawBounds, drawPath(path, paint, prePathMatrix, false));
 }

 void SkThreadedBMPDevice::drawBitmap(const SkBitmap& bitmap, SkScalar x, SkScalar y,
         const SkPaint& paint) {
     SkMatrix matrix = SkMatrix::MakeTrans(x, y);
     LogDrawScaleFactor(SkMatrix::Concat(this->ctm(), matrix), paint.getFilterQuality());
     SkRect drawBounds = SkRect::MakeWH(bitmap.width(), bitmap.height());
     matrix.mapRect(&drawBounds);
     THREADED_DRAW(drawBounds, drawBitmap(bitmap, matrix, nullptr, paint));
 }

 void SkThreadedBMPDevice::drawSprite(const SkBitmap& bitmap, int x, int y, const SkPaint& paint) {
     SkRect drawBounds = SkRect::MakeXYWH(x, y, bitmap.width(), bitmap.height());
     THREADED_DRAW(drawBounds, drawSprite(bitmap, x, y, paint));
 }

 // Queues a drawText of `len` bytes of `text` at (x, y).
 void SkThreadedBMPDevice::drawText(const void* text, size_t len, SkScalar x, SkScalar y,
         const SkPaint& paint) {
     // The queued closure is invoked later (through fDrawFn), after this call has returned, so
     // capturing the caller's `text` pointer could leave it dangling. Capture a private copy of
     // the bytes instead. (Not const, so the closure holding it can still be moved.)
     const char* textBytes = static_cast<const char*>(text);
     std::vector<char> textCopy(textBytes, textBytes + len);
     SkRect drawBounds = SkRect::MakeLargest(); // TODO tighter drawBounds
     THREADED_DRAW(drawBounds, drawText(textCopy.data(), len, x, y, paint, &this->surfaceProps()));
 }

 void SkThreadedBMPDevice::drawPosText(const void* text, size_t len, const SkScalar xpos[],
         int scalarsPerPos, const SkPoint& offset, const SkPaint& paint) {
     SkRect drawBounds = SkRect::MakeLargest(); // TODO tighter drawBounds
     THREADED_DRAW(drawBounds, drawPosText((const char*)text, len, xpos, scalarsPerPos, offset,
                                           paint, &surfaceProps()));
 }

 void SkThreadedBMPDevice::drawVertices(const SkVertices* vertices, SkBlendMode bmode,
         const SkPaint& paint) {
     SkRect drawBounds = SkRect::MakeLargest(); // TODO tighter drawBounds
     THREADED_DRAW(drawBounds, drawVertices(vertices->mode(), vertices->vertexCount(),
                                            vertices->positions(), vertices->texCoords(),
                                            vertices->colors(), bmode, vertices->indices(),
                                            vertices->indexCount(), paint));
 }

 // Queues a sprite draw of another device's bitmap at (x, y). `device` is treated as an
 // SkBitmapDevice. The paint must not carry an image filter.
 void SkThreadedBMPDevice::drawDevice(SkBaseDevice* device, int x, int y, const SkPaint& paint) {
     SkASSERT(!paint.getImageFilter());
     SkRect drawBounds = SkRect::MakeXYWH(x, y, device->width(), device->height());
     // The queued closure is invoked later (through fDrawFn), after this call has returned, so
     // dereferencing `device` inside it could touch a destroyed device. Copy its SkBitmap now and
     // let the closure capture that copy.
     SkBitmap srcBitmap = static_cast<SkBitmapDevice*>(device)->fBitmap;
     THREADED_DRAW(drawBounds, drawSprite(srcBitmap, x, y, paint));
 }
	/*
	* Copyright 2017 Google Inc.
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#include "SkThreadedBMPDevice.h"

	#include "SkPath.h"
	#include "SkTaskGroup.h"
	#include "SkVertices.h"

	#include <mutex>
	#include <vector>

	// Alignment, in bytes, applied to the per-tile TileScheduleData structs in this file so that
	// each one starts on its own cache line. 64 is an assumed cache-line size, not a queried one.
	constexpr int MAX_CACHE_LINE = 64;

	// Basic logic and data shared by the current experimental schedulers.
	//
	// signal() is called once per queued draw and finish() once no further draws will be signaled;
	// the derived classes read fDrawCnt and fIsFinishing from next().
	class TiledDrawSchedulerBase : public TiledDrawScheduler {
	public:
	    // tiles - number of tiles the schedulers distribute work over.
	    // work  - callback invoked as work(tileIndex, drawIndex) to run one draw on one tile.
	    TiledDrawSchedulerBase(int tiles, WorkFunc work)
	            : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(std::move(work)) {}

	    // Announces that one more draw is available.
	    void signal() override {
	        fDrawCnt++;
	    }

	    // Announces that no more draws will be signaled.
	    //
	    // The store uses release ordering so that every earlier signal() is published together with
	    // the flag: a reader that observes fIsFinishing == true through an acquire load is then
	    // guaranteed to read the final fDrawCnt. A relaxed store gives no such guarantee.
	    void finish() override {
	        fIsFinishing.store(true, std::memory_order_release);
	    }

	protected:
	    const int fTileCnt;              // number of tiles
	    std::atomic<bool> fIsFinishing;  // set by finish()
	    std::atomic<int> fDrawCnt;       // number of draws signaled so far
	    WorkFunc fWork;                  // runs draw #drawIndex on tile #tileIndex
	};

	class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase {
	public:
	TiledDrawSchedulerBySpinning(int tiles, WorkFunc work)
	: TiledDrawSchedulerBase(tiles, std::move(work)), fScheduleData(tiles) {}

	void signal() final { this->TiledDrawSchedulerBase::signal(); }
	void finish() final { this->TiledDrawSchedulerBase::finish(); }

	bool next(int& tileIndex) final {
	int& drawIndex = fScheduleData[tileIndex].fDrawIndex;
	SkASSERT(drawIndex <= fDrawCnt);
	while (true) {
	bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
	if (isFinishing && drawIndex >= fDrawCnt) {
	return false;
	} else if (drawIndex < fDrawCnt) {
	fWork(tileIndex, drawIndex++);
	return true;
	}
	}
	}

	private:
	// alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
	struct alignas(MAX_CACHE_LINE) TileScheduleData {
	TileScheduleData() : fDrawIndex(0) {}

	int fDrawIndex; // next draw index for this tile
	};

	std::vector<TileScheduleData> fScheduleData;
	};

	class TiledDrawSchedulerFlexible : public TiledDrawSchedulerBase {
	public:
	TiledDrawSchedulerFlexible(int tiles, WorkFunc work)
	: TiledDrawSchedulerBase(tiles, std::move(work)), fScheduleData(tiles) {}

	void signal() final { this->TiledDrawSchedulerBase::signal(); }
	void finish() final { this->TiledDrawSchedulerBase::finish(); }

	bool next(int& tileIndex) final {
	int failCnt = 0;
	while (true) {
	TileScheduleData& scheduleData = fScheduleData[tileIndex];
	bool locked = scheduleData.fMutex.try_lock();
	bool processed = false;

	if (locked) {
	if (scheduleData.fDrawIndex < fDrawCnt) {
	fWork(tileIndex, scheduleData.fDrawIndex++);
	processed = true;
	} else {
	failCnt += fIsFinishing.load(std::memory_order_relaxed);
	}
	scheduleData.fMutex.unlock();
	}

	if (processed) {
	return true;
	} else {
	if (failCnt >= fTileCnt) {
	return false;
	}
	tileIndex = (tileIndex + 1) % fTileCnt;
	}
	}
	}

	private:
	// alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
	struct alignas(MAX_CACHE_LINE) TileScheduleData {
	TileScheduleData() : fDrawIndex(0) {}

	int fDrawIndex; // next draw index for this tile
	std::mutex fMutex; // the mutex for the thread to acquire
	};

	std::vector<TileScheduleData> fScheduleData;
	};

	class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase {
	public:
	TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work)
	: TiledDrawSchedulerBase(tiles, std::move(work)), fScheduleData(tiles) {}


	void signal() final {
	this->TiledDrawSchedulerBase::signal();
	signalRoot();
	}

	void finish() final {
	this->TiledDrawSchedulerBase::finish();
	signalRoot();
	}

	bool next(int& tileIndex) final {
	SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt);
	TileScheduleData& scheduleData = fScheduleData[tileIndex];
	while (true) {
	scheduleData.fSemaphore.wait();
	int leftChild = (tileIndex + 1) * 2 - 1;
	int rightChild = leftChild + 1;
	if (leftChild < fTileCnt) {
	fScheduleData[leftChild].fSemaphore.signal();
	}
	if (rightChild < fTileCnt) {
	fScheduleData[rightChild].fSemaphore.signal();
	}

	bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
	if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) {
	return false;
	} else {
	SkASSERT(scheduleData.fDrawIndex < fDrawCnt);
	fWork(tileIndex, scheduleData.fDrawIndex++);
	return true;
	}
	}
	}

	private:
	// alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
	struct alignas(MAX_CACHE_LINE) TileScheduleData {
	TileScheduleData() : fDrawIndex(0) {}

	int fDrawIndex;
	SkSemaphore fSemaphore;
	};

	void signalRoot() {
	SkASSERT(fTileCnt > 0);
	fScheduleData[0].fSemaphore.signal();
	}

	std::vector<TileScheduleData> fScheduleData;
	};

	// Creates a fresh scheduler and launches fThreadCnt tasks on the task group; each task keeps
	// calling the scheduler's next() until it returns false. Expects an empty queue.
	void SkThreadedBMPDevice::startThreads() {
	    SkASSERT(fQueueSize == 0);

	    // using Scheduler = TiledDrawSchedulerBySemaphores;
	    // using Scheduler = TiledDrawSchedulerBySpinning;
	    using Scheduler = TiledDrawSchedulerFlexible;

	    // Runs queued draw #draw on tile #tile, skipping draws whose bounds miss the tile.
	    TiledDrawScheduler::WorkFunc runDrawOnTile = [this](int tile, int draw) {
	        auto& queued = fQueue[draw];
	        if (!SkIRect::Intersects(fTileBounds[tile], queued.fDrawBounds)) {
	            return;
	        }
	        queued.fDrawFn(fTileBounds[tile]);
	    };
	    fScheduler.reset(new Scheduler(fTileCnt, runDrawOnTile));

	    // We intentionally call the int parameter tileIndex although it ranges from 0 to fThreadCnt-1.
	    // For some schedulers (e.g., TiledDrawSchedulerBySemaphores and TiledDrawSchedulerBySpinning),
	    // fThreadCnt should be equal to fTileCnt so it doesn't make a difference.
	    //
	    // For TiledDrawSchedulerFlexible, the input tileIndex provides only a hint about which tile
	    // the current thread should draw; the scheduler may later modify that tileIndex to draw on
	    // another tile.
	    fTaskGroup->batch(fThreadCnt, [this](int tileIndex) {
	        for (;;) {
	            if (!fScheduler->next(tileIndex)) {
	                break;
	            }
	        }
	    });
	}

	// Ends the current batch: tells the scheduler that no more draws are coming, waits on the task
	// group, then empties the queue and drops the scheduler.
	void SkThreadedBMPDevice::finishThreads() {
	fScheduler->finish();
	fTaskGroup->wait();
	// The queue is reset only after the wait: the tasks started by startThreads() read fQueue.
	fQueueSize = 0;
	fScheduler.reset(nullptr);
	}

	// bitmap   - target bitmap, forwarded to the base device.
	// tiles    - number of horizontal stripes the bitmap is split into.
	// threads  - number of tasks to launch; a value <= 0 means "same as tiles".
	// executor - executor to run on; when null, an internal FIFO thread pool is created.
	SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap,
	                                         int tiles,
	                                         int threads,
	                                         SkExecutor* executor)
	        : INHERITED(bitmap)
	        , fTileCnt(tiles)
	        , fThreadCnt(threads <= 0 ? tiles : threads)
	{
	    if (executor != nullptr) {
	        fExecutor = executor;
	    } else {
	        fInternalExecutor = SkExecutor::MakeFIFOThreadPool(fThreadCnt);
	        fExecutor = fInternalExecutor.get();
	    }

	    // Tiling using stripes for now; we'll explore better tiling in the future.
	    // Stripe height is the bitmap height divided by the tile count, rounded up.
	    const int stripeW = bitmap.width();
	    const int stripeH = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1);
	    for (int tid = 0; tid < fTileCnt; ++tid) {
	        const int top = tid * stripeH;
	        fTileBounds.push_back(SkIRect::MakeLTRB(0, top, stripeW, top + stripeH));
	    }

	    fQueueSize = 0;
	    fTaskGroup.reset(new SkTaskGroup(*fExecutor));
	    this->startThreads();
	}

	// Finishes the current batch of queued draws, then starts a new scheduler and a new batch of
	// tasks so that further draws can be queued.
	void SkThreadedBMPDevice::flush() {
	finishThreads();
	startThreads();
	}

	// Snapshot of the device state (destination pixmap, matrix, raster clip) taken when a draw is
	// queued. Having this captured in lambda seems to be faster than saving this in DrawElement.
	struct SkThreadedBMPDevice::DrawState {
	    SkPixmap fDst;
	    SkMatrix fMatrix;
	    SkRasterClip fRC;

	    explicit DrawState(SkThreadedBMPDevice* dev) {
	        // we need fDst to be set, and if we're actually drawing, to dirty the genID
	        const bool hasPixels = dev->accessPixels(&fDst);
	        if (!hasPixels) {
	            // NoDrawDevice uses us (why?) so we have to catch this case w/ no pixels
	            fDst.reset(dev->imageInfo(), nullptr, 0);
	        }
	        fMatrix = dev->ctm();
	        fRC = dev->fRCStack.rc();
	    }

	    // Builds an SkDraw for one tile.
	    //   threadRC     - out: set to this state's clip intersected with threadBounds. The returned
	    //                  SkDraw points at it, so it must outlive the returned value.
	    //   threadBounds - bounds of the tile being drawn.
	    // The returned SkDraw also points at this object's fMatrix.
	    SkDraw getThreadDraw(SkRasterClip& threadRC, const SkIRect& threadBounds) const {
	        threadRC = fRC;
	        threadRC.op(threadBounds, SkRegion::kIntersect_Op);

	        SkDraw tileDraw;
	        tileDraw.fDst = fDst;
	        tileDraw.fMatrix = &fMatrix;
	        tileDraw.fRC = &threadRC;
	        return tileDraw;
	    }
	};

	// Maps drawBounds through the current matrix and rounds the result out to integers. The
	// "largest" rect is passed through as the largest SkIRect without being mapped.
	SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const {
	    if (!drawBounds.isLargest()) {
	        SkRect mapped;
	        this->ctm().mapRect(&mapped, drawBounds);
	        return mapped.roundOut();
	    }
	    return SkIRect::MakeLargest();
	}

	// Queues one draw and signals the scheduler.
	//   drawBounds     - SkRect bounds of the draw; passed through transformDrawBounds() and stored
	//                    with the queue entry.
	//   actualDrawCall - an SkDraw member call such as drawRect(r, paint); it is expanded as
	//                    "draw.actualDrawCall" inside the queued closure.
	// If the queue already holds MAX_QUEUE_SIZE entries, it is flushed first. The closure captures
	// by copy ([=]) the DrawState snapshot and every local it names; for a pointer, only the pointer
	// itself is copied, not the data it points to.
	// The do {...} while (false) is to enforce trailing semicolon as suggested by mtklein@
	#define THREADED_DRAW(drawBounds, actualDrawCall) \
	do { \
	if (fQueueSize == MAX_QUEUE_SIZE) { \
	this->flush(); \
	} \
	DrawState ds(this); \
	SkASSERT(fQueueSize < MAX_QUEUE_SIZE); \
	fQueue[fQueueSize++] = { \
	this->transformDrawBounds(drawBounds), \
	[=](const SkIRect& tileBounds) { \
	SkRasterClip tileRC; \
	SkDraw draw = ds.getThreadDraw(tileRC, tileBounds); \
	draw.actualDrawCall; \
	}, \
	}; \
	fScheduler->signal(); \
	} while (false)

	// Returns the paint's fast bounds for rect r, or the "largest" rect when the paint cannot
	// compute fast bounds.
	static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) {
	    if (!p.canComputeFastBounds()) {
	        return SkRect::MakeLargest();
	    }
	    SkRect storage;
	    storage = p.computeFastBounds(r, &storage);
	    return storage;
	}

	void SkThreadedBMPDevice::drawPaint(const SkPaint& paint) {
	THREADED_DRAW(SkRect::MakeLargest(), drawPaint(paint));
	}

	// Queues a drawPoints of `count` points from `pts`.
	void SkThreadedBMPDevice::drawPoints(SkCanvas::PointMode mode, size_t count,
	        const SkPoint pts[], const SkPaint& paint) {
	    // The queued closure is invoked later (through fDrawFn), after this call has returned, so
	    // capturing the caller's `pts` pointer could leave it dangling. Capture a private copy of
	    // the points instead. (Not const, so the closure holding it can still be moved.)
	    std::vector<SkPoint> ptsCopy(pts, pts + count);
	    // TODO tighter drawBounds
	    SkRect drawBounds = SkRect::MakeLargest();
	    THREADED_DRAW(drawBounds, drawPoints(mode, count, ptsCopy.data(), paint, nullptr));
	}

	void SkThreadedBMPDevice::drawRect(const SkRect& r, const SkPaint& paint) {
	SkRect drawBounds = get_fast_bounds(r, paint);
	THREADED_DRAW(drawBounds, drawRect(r, paint));
	}

	void SkThreadedBMPDevice::drawRRect(const SkRRect& rrect, const SkPaint& paint) {
	#ifdef SK_IGNORE_BLURRED_RRECT_OPT
	SkPath path;

	path.addRRect(rrect);
	// call the VIRTUAL version, so any subclasses who do handle drawPath aren't
	// required to override drawRRect.
	this->drawPath(path, paint, nullptr, false);
	#else
	SkRect drawBounds = get_fast_bounds(rrect.getBounds(), paint);
	THREADED_DRAW(drawBounds, drawRRect(rrect, paint));
	#endif
	}

	void SkThreadedBMPDevice::drawPath(const SkPath& path, const SkPaint& paint,
	const SkMatrix* prePathMatrix, bool pathIsMutable) {
	SkRect drawBounds = path.isInverseFillType() ? SkRect::MakeLargest()
	: get_fast_bounds(path.getBounds(), paint);
	// For thread safety, make path imutable
	THREADED_DRAW(drawBounds, drawPath(path, paint, prePathMatrix, false));
	}

	void SkThreadedBMPDevice::drawBitmap(const SkBitmap& bitmap, SkScalar x, SkScalar y,
	const SkPaint& paint) {
	SkMatrix matrix = SkMatrix::MakeTrans(x, y);
	LogDrawScaleFactor(SkMatrix::Concat(this->ctm(), matrix), paint.getFilterQuality());
	SkRect drawBounds = SkRect::MakeWH(bitmap.width(), bitmap.height());
	matrix.mapRect(&drawBounds);
	THREADED_DRAW(drawBounds, drawBitmap(bitmap, matrix, nullptr, paint));
	}

	void SkThreadedBMPDevice::drawSprite(const SkBitmap& bitmap, int x, int y, const SkPaint& paint) {
	SkRect drawBounds = SkRect::MakeXYWH(x, y, bitmap.width(), bitmap.height());
	THREADED_DRAW(drawBounds, drawSprite(bitmap, x, y, paint));
	}

	// Queues a drawText of `len` bytes of `text` at (x, y).
	void SkThreadedBMPDevice::drawText(const void* text, size_t len, SkScalar x, SkScalar y,
	        const SkPaint& paint) {
	    // The queued closure is invoked later (through fDrawFn), after this call has returned, so
	    // capturing the caller's `text` pointer could leave it dangling. Capture a private copy of
	    // the bytes instead. (Not const, so the closure holding it can still be moved.)
	    const char* textBytes = static_cast<const char*>(text);
	    std::vector<char> textCopy(textBytes, textBytes + len);
	    SkRect drawBounds = SkRect::MakeLargest(); // TODO tighter drawBounds
	    THREADED_DRAW(drawBounds, drawText(textCopy.data(), len, x, y, paint, &this->surfaceProps()));
	}

	void SkThreadedBMPDevice::drawPosText(const void* text, size_t len, const SkScalar xpos[],
	int scalarsPerPos, const SkPoint& offset, const SkPaint& paint) {
	SkRect drawBounds = SkRect::MakeLargest(); // TODO tighter drawBounds
	THREADED_DRAW(drawBounds, drawPosText((const char*)text, len, xpos, scalarsPerPos, offset,
	paint, &surfaceProps()));
	}

	void SkThreadedBMPDevice::drawVertices(const SkVertices* vertices, SkBlendMode bmode,
	const SkPaint& paint) {
	SkRect drawBounds = SkRect::MakeLargest(); // TODO tighter drawBounds
	THREADED_DRAW(drawBounds, drawVertices(vertices->mode(), vertices->vertexCount(),
	vertices->positions(), vertices->texCoords(),
	vertices->colors(), bmode, vertices->indices(),
	vertices->indexCount(), paint));
	}

	// Queues a sprite draw of another device's bitmap at (x, y). `device` is treated as an
	// SkBitmapDevice. The paint must not carry an image filter.
	void SkThreadedBMPDevice::drawDevice(SkBaseDevice* device, int x, int y, const SkPaint& paint) {
	    SkASSERT(!paint.getImageFilter());
	    SkRect drawBounds = SkRect::MakeXYWH(x, y, device->width(), device->height());
	    // The queued closure is invoked later (through fDrawFn), after this call has returned, so
	    // dereferencing `device` inside it could touch a destroyed device. Copy its SkBitmap now and
	    // let the closure capture that copy.
	    SkBitmap srcBitmap = static_cast<SkBitmapDevice*>(device)->fBitmap;
	    THREADED_DRAW(drawBounds, drawSprite(srcBitmap, x, y, paint));
	}