example_transcoding/utils.cpp - external/github.com/BinomialLLC/basis_universal - Git at Google

 // File: utils.cpp
 #include "utils.h"
 //#include "lodepng.h"
 //#include "miniz.h"

 namespace utils
 {

 #define FLOOD_PUSH(y, xl, xr, dy) if (((y + (dy)) >= 0) && ((y + (dy)) < (int)m_height)) { stack.push_back(fill_segment(y, xl, xr, dy)); }

 // See http://www.realtimerendering.com/resources/GraphicsGems/gems/SeedFill.c
 uint32_t image_u8::flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector<pixel_coord>* pSet_pixels)
 {
 	uint32_t total_set = 0;

 	if (!flood_fill_is_inside(x, y, b))
 		return 0;

 	std::vector<fill_segment> stack;
 	stack.reserve(64);

 	FLOOD_PUSH(y, x, x, 1);
 	FLOOD_PUSH(y + 1, x, x, -1);

 	while (stack.size())
 	{
 		fill_segment s = stack.back();
 		stack.pop_back();

 		int x1 = s.m_xl, x2 = s.m_xr, dy = s.m_dy;
 		y = s.m_y + s.m_dy;

 		for (x = x1; (x >= 0) && flood_fill_is_inside(x, y, b); x--)
 		{
 			(*this)(x, y) = c;
 			total_set++;
 			if (pSet_pixels)
 				pSet_pixels->push_back(pixel_coord(x, y));
 		}

 		int l;

 		if (x >= x1)
 			goto skip;

 		l = x + 1;
 		if (l < x1)
 			FLOOD_PUSH(y, l, x1 - 1, -dy);

 		x = x1 + 1;

 		do
 		{
 			for (; x <= ((int)m_width - 1) && flood_fill_is_inside(x, y, b); x++)
 			{
 				(*this)(x, y) = c;
 				total_set++;
 				if (pSet_pixels)
 					pSet_pixels->push_back(pixel_coord(x, y));
 			}
 			FLOOD_PUSH(y, l, x - 1, dy);

 			if (x > (x2 + 1))
 				FLOOD_PUSH(y, x2 + 1, x - 1, -dy);

 		skip:
 			for (x++; x <= x2 && !flood_fill_is_inside(x, y, b); x++)
 				;

 			l = x;
 		} while (x <= x2);
 	}

 	return total_set;
 }

 void image_u8::draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color)
 {
 	if (xs > xe)
 	{
 		std::swap(xs, xe);
 		std::swap(ys, ye);
 	}

 	int dx = xe - xs, dy = ye - ys;
 	if (!dx)
 	{
 		if (ys > ye)
 			std::swap(ys, ye);
 		for (int i = ys; i <= ye; i++)
 			set_pixel_clipped(xs, i, color);
 	}
 	else if (!dy)
 	{
 		for (int i = xs; i < xe; i++)
 			set_pixel_clipped(i, ys, color);
 	}
 	else if (dy > 0)
 	{
 		if (dy <= dx)
 		{
 			int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx);
 			rasterize_line(xs, ys, xe, ye, 0, 1, e, e_inc, e_no_inc, color);
 		}
 		else
 		{
 			int e = 2 * dx - dy, e_no_inc = 2 * dx, e_inc = 2 * (dx - dy);
 			rasterize_line(xs, ys, xe, ye, 1, 1, e, e_inc, e_no_inc, color);
 		}
 	}
 	else
 	{
 		dy = -dy;
 		if (dy <= dx)
 		{
 			int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx);
 			rasterize_line(xs, ys, xe, ye, 0, -1, e, e_inc, e_no_inc, color);
 		}
 		else
 		{
 			int e = 2 * dx - dy, e_no_inc = (2 * dx), e_inc = 2 * (dx - dy);
 			rasterize_line(xe, ye, xs, ys, 1, -1, e, e_inc, e_no_inc, color);
 		}
 	}
 }

 void image_u8::rasterize_line(int xs, int ys, int xe, int ye, int pred, int inc_dec, int e, int e_inc, int e_no_inc, const color_quad_u8& color)
 {
 	int start, end, var;

 	if (pred)
 	{
 		start = ys;
 		end = ye;
 		var = xs;
 		for (int i = start; i <= end; i++)
 		{
 			set_pixel_clipped(var, i, color);
 			if (e < 0)
 				e += e_no_inc;
 			else
 			{
 				var += inc_dec;
 				e += e_inc;
 			}
 		}
 	}
 	else
 	{
 		start = xs;
 		end = xe;
 		var = ys;
 		for (int i = start; i <= end; i++)
 		{
 			set_pixel_clipped(i, var, color);
 			if (e < 0)
 				e += e_no_inc;
 			else
 			{
 				var += inc_dec;
 				e += e_inc;
 			}
 		}
 	}
 }

 #if 0
 bool load_png(const char* pFilename, image_u8& img)
 {
 	img.clear();

 	std::vector<unsigned char> pixels;
 	unsigned int w = 0, h = 0;
 	unsigned int e = lodepng::decode(pixels, w, h, pFilename);
 	if (e != 0)
 	{
 		fprintf(stderr, "Failed loading PNG file %s\n", pFilename);
 		return false;
 	}

 	img.init(w, h);
 	memcpy(&img.get_pixels()[0], &pixels[0], w * h * sizeof(uint32_t));

 	return true;
 }

 bool save_png(const char* pFilename, const image_u8& img, bool save_alpha)
 {
 	const uint32_t w = img.width();
 	const uint32_t h = img.height();

 	std::vector<unsigned char> pixels;
 	if (save_alpha)
 	{
 		pixels.resize(w * h * sizeof(color_quad_u8));
 		memcpy(&pixels[0], &img.get_pixels()[0], w * h * sizeof(color_quad_u8));
 	}
 	else
 	{
 		pixels.resize(w * h * 3);
 		unsigned char* pDst = &pixels[0];
 		for (uint32_t y = 0; y < h; y++)
 			for (uint32_t x = 0; x < w; x++, pDst += 3)
 				pDst[0] = img(x, y)[0], pDst[1] = img(x, y)[1], pDst[2] = img(x, y)[2];
 	}

 	return lodepng::encode(pFilename, pixels, w, h, save_alpha ? LCT_RGBA : LCT_RGB) == 0;
 }
 #endif

 static float gauss(int x, int y, float sigma_sqr)
 {
 	float pow = expf(-((x * x + y * y) / (2.0f * sigma_sqr)));
 	float g = (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * pow;
 	return g;
 }

 // size_x/y should be odd
 void compute_gaussian_kernel(float* pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags)
 {
 	assert(size_x & size_y & 1);

 	if (!(size_x | size_y))
 		return;

 	int mid_x = size_x / 2;
 	int mid_y = size_y / 2;

 	double sum = 0;
 	for (int x = 0; x < size_x; x++)
 	{
 		for (int y = 0; y < size_y; y++)
 		{
 			float g;
 			if ((x > mid_x) && (y < mid_y))
 				g = pDst[(size_x - x - 1) + y * size_x];
 			else if ((x < mid_x) && (y > mid_y))
 				g = pDst[x + (size_y - y - 1) * size_x];
 			else if ((x > mid_x) && (y > mid_y))
 				g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x];
 			else
 				g = gauss(x - mid_x, y - mid_y, sigma_sqr);

 			pDst[x + y * size_x] = g;
 			sum += g;
 		}
 	}

 	if (flags & cComputeGaussianFlagNormalizeCenterToOne)
 	{
 		sum = pDst[mid_x + mid_y * size_x];
 	}

 	if (flags & (cComputeGaussianFlagNormalizeCenterToOne | cComputeGaussianFlagNormalize))
 	{
 		double one_over_sum = 1.0f / sum;
 		for (int i = 0; i < size_x * size_y; i++)
 			pDst[i] = static_cast<float>(pDst[i] * one_over_sum);

 		if (flags & cComputeGaussianFlagNormalizeCenterToOne)
 			pDst[mid_x + mid_y * size_x] = 1.0f;
 	}

 	if (flags & cComputeGaussianFlagPrint)
 	{
 		printf("{\n");
 		for (int y = 0; y < size_y; y++)
 		{
 			printf("  ");
 			for (int x = 0; x < size_x; x++)
 			{
 				printf("%f, ", pDst[x + y * size_x]);
 			}
 			printf("\n");
 		}
 		printf("}");
 	}
 }

 void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)
 {
 	assert(odd_filter_width && (odd_filter_width & 1));
 	odd_filter_width |= 1;

 	std::vector<float> kernel(odd_filter_width * odd_filter_width);
 	compute_gaussian_kernel(&kernel[0], odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize);

 	const int dst_width = orig_img.get_width() / width_divisor;
 	const int dst_height = orig_img.get_height() / height_divisor;

 	const int H = odd_filter_width / 2;
 	const int L = -H;

 	dst.crop(dst_width, dst_height);

 //#pragma omp parallel for
 	for (int oy = 0; oy < dst_height; oy++)
 	{
 		for (int ox = 0; ox < dst_width; ox++)
 		{
 			vec4F c(0.0f);

 			for (int yd = L; yd <= H; yd++)
 			{
 				int y = oy * height_divisor + (height_divisor >> 1) + yd;

 				for (int xd = L; xd <= H; xd++)
 				{
 					int x = ox * width_divisor + (width_divisor >> 1) + xd;

 					const vec4F& p = orig_img.get_clamped_or_wrapped(x, y, wrapping, wrapping);

 					float w = kernel[(xd + H) + (yd + H) * odd_filter_width];
 					c[0] += p[0] * w;
 					c[1] += p[1] * w;
 					c[2] += p[2] * w;
 					c[3] += p[3] * w;
 				}
 			}

 			dst(ox, oy).set(c[0], c[1], c[2], c[3]);
 		}
 	}
 }

 static void pow_image(const imagef& src, imagef& dst, const vec4F& power)
 {
 	dst.resize(src);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& p = src(x, y);

 			if ((power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f))
 				dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]);
 			else
 				dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3]));
 		}
 	}
 }

 #if 0
 static void mul_image(const imagef& src, imagef& dst, const vec4F& mul)
 {
 	dst.resize(src);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& p = src(x, y);
 			dst(x, y).set(p[0] * mul[0], p[1] * mul[1], p[2] * mul[2], p[3] * mul[3]);
 		}
 	}
 }
 #endif

 static void scale_image(const imagef& src, imagef& dst, const vec4F& scale, const vec4F& shift)
 {
 	dst.resize(src);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& p = src(x, y);

 			vec4F d;

 			for (uint32_t c = 0; c < 4; c++)
 				d[c] = scale[c] * p[c] + shift[c];

 			dst(x, y).set(d[0], d[1], d[2], d[3]);
 		}
 	}
 }

 static void add_weighted_image(const imagef& src1, const vec4F& alpha, const imagef& src2, const vec4F& beta, const vec4F& gamma, imagef& dst)
 {
 	dst.resize(src1);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& s1 = src1(x, y);
 			const vec4F& s2 = src2(x, y);

 			dst(x, y).set(
 				s1[0] * alpha[0] + s2[0] * beta[0] + gamma[0],
 				s1[1] * alpha[1] + s2[1] * beta[1] + gamma[1],
 				s1[2] * alpha[2] + s2[2] * beta[2] + gamma[2],
 				s1[3] * alpha[3] + s2[3] * beta[3] + gamma[3]);
 		}
 	}
 }

 static void add_image(const imagef& src1, const imagef& src2, imagef& dst)
 {
 	dst.resize(src1);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& s1 = src1(x, y);
 			const vec4F& s2 = src2(x, y);

 			dst(x, y).set(s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2], s1[3] + s2[3]);
 		}
 	}
 }

 static void adds_image(const imagef& src, const vec4F& value, imagef& dst)
 {
 	dst.resize(src);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& p = src(x, y);

 			dst(x, y).set(p[0] + value[0], p[1] + value[1], p[2] + value[2], p[3] + value[3]);
 		}
 	}
 }

 static void mul_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale)
 {
 	dst.resize(src1);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& s1 = src1(x, y);
 			const vec4F& s2 = src2(x, y);

 			vec4F d;

 			for (uint32_t c = 0; c < 4; c++)
 			{
 				float v1 = s1[c];
 				float v2 = s2[c];
 				d[c] = v1 * v2 * scale[c];
 			}

 			dst(x, y) = d;
 		}
 	}
 }

 static void div_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale)
 {
 	dst.resize(src1);

 //#pragma omp parallel for
 	for (int y = 0; y < (int)dst.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < dst.get_width(); x++)
 		{
 			const vec4F& s1 = src1(x, y);
 			const vec4F& s2 = src2(x, y);

 			vec4F d;

 			for (uint32_t c = 0; c < 4; c++)
 			{
 				float v = s2[c];
 				if (v == 0.0f)
 					d[c] = 0.0f;
 				else
 					d[c] = (s1[c] * scale[c]) / v;
 			}

 			dst(x, y) = d;
 		}
 	}
 }

 static vec4F avg_image(const imagef& src)
 {
 	vec4F avg(0.0f);

 	for (uint32_t y = 0; y < src.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < src.get_width(); x++)
 		{
 			const vec4F& s = src(x, y);

 			avg += vec4F(s[0], s[1], s[2], s[3]);
 		}
 	}

 	avg /= static_cast<float>(src.get_total_pixels());

 	return avg;
 }

 // Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html
 vec4F compute_ssim(const imagef& a, const imagef& b)
 {
 	imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3;

 	const float C1 = 6.50250f, C2 = 58.52250f;

 	pow_image(a, a_sq, vec4F(2));
 	pow_image(b, b_sq, vec4F(2));
 	mul_image(a, b, axb, vec4F(1.0f));

 	gaussian_filter(mu1, a, 11, 1.5f * 1.5f);
 	gaussian_filter(mu2, b, 11, 1.5f * 1.5f);

 	pow_image(mu1, mu1_sq, vec4F(2));
 	pow_image(mu2, mu2_sq, vec4F(2));
 	mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f));

 	gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f);
 	add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq);

 	gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f);
 	add_weighted_image(s2_sq, vec4F(1), mu2_sq, vec4F(-1), vec4F(0), s2_sq);

 	gaussian_filter(s12, axb, 11, 1.5f * 1.5f);
 	add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12);

 	scale_image(mu1_mu2, t1, vec4F(2), vec4F(0));
 	adds_image(t1, vec4F(C1), t1);

 	scale_image(s12, t2, vec4F(2), vec4F(0));
 	adds_image(t2, vec4F(C2), t2);

 	mul_image(t1, t2, t3, vec4F(1));

 	add_image(mu1_sq, mu2_sq, t1);
 	adds_image(t1, vec4F(C1), t1);

 	add_image(s1_sq, s2_sq, t2);
 	adds_image(t2, vec4F(C2), t2);

 	mul_image(t1, t2, t1, vec4F(1));

 	div_image(t3, t1, smap, vec4F(1));

 	return avg_image(smap);
 }

 vec4F compute_ssim(const image_u8& a, const image_u8& b, bool luma)
 {
 	image_u8 ta(a), tb(b);

 	if ((ta.width() != tb.width()) || (ta.height() != tb.height()))
 	{
 		fprintf(stderr, "compute_ssim: Cropping input images to equal dimensions\n");

 		const uint32_t w = std::min(a.width(), b.width());
 		const uint32_t h = std::min(a.height(), b.height());
 		ta.crop(w, h);
 		tb.crop(w, h);
 	}

 	if (!ta.width() || !ta.height())
 	{
 		assert(0);
 		return vec4F(0);
 	}

 	if (luma)
 	{
 		for (uint32_t y = 0; y < ta.height(); y++)
 		{
 			for (uint32_t x = 0; x < ta.width(); x++)
 			{
 				ta(x, y).set((uint8_t)ta(x, y).get_luma(), ta(x, y).a);
 				tb(x, y).set((uint8_t)tb(x, y).get_luma(), tb(x, y).a);
 			}
 		}
 	}

 	imagef fta, ftb;

 	fta.set(ta);
 	ftb.set(tb);

 	return compute_ssim(fta, ftb);
 }

 bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header)
 {
 	(void)srgb;

 	FILE* pFile = NULL;
 #ifdef _MSC_VER
 	fopen_s(&pFile, pFilename, "wb");
 #else
 	pFile = fopen(pFilename, "wb");
 #endif
 	if (!pFile)
 	{
 		fprintf(stderr, "Failed creating file %s!\n", pFilename);
 		return false;
 	}

 	fwrite("DDS ", 4, 1, pFile);

 	DDSURFACEDESC2 desc;
 	memset(&desc, 0, sizeof(desc));

 	desc.dwSize = sizeof(desc);
 	desc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT | DDSD_CAPS;

 	desc.dwWidth = width;
 	desc.dwHeight = height;

 	desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE;
 	desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat);

 	desc.ddpfPixelFormat.dwFlags |= DDPF_FOURCC;

 	desc.lPitch = (((desc.dwWidth + 3) & ~3) * ((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3;
 	desc.dwFlags |= DDSD_LINEARSIZE;

 	desc.ddpfPixelFormat.dwRGBBitCount = 0;

 	if ((!force_dx10_header) &&
 		((dxgi_format == DXGI_FORMAT_BC1_UNORM) ||
 			(dxgi_format == DXGI_FORMAT_BC3_UNORM) ||
 			(dxgi_format == DXGI_FORMAT_BC4_UNORM) ||
 			(dxgi_format == DXGI_FORMAT_BC5_UNORM)))
 	{
 		if (dxgi_format == DXGI_FORMAT_BC1_UNORM)
 			desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '1');
 		else if (dxgi_format == DXGI_FORMAT_BC3_UNORM)
 			desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '5');
 		else if (dxgi_format == DXGI_FORMAT_BC4_UNORM)
 			desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '1');
 		else if (dxgi_format == DXGI_FORMAT_BC5_UNORM)
 			desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '2');

 		fwrite(&desc, sizeof(desc), 1, pFile);
 	}
 	else
 	{
 		desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0');

 		fwrite(&desc, sizeof(desc), 1, pFile);

 		DDS_HEADER_DXT10 hdr10;
 		memset(&hdr10, 0, sizeof(hdr10));

 		// Not all tools support DXGI_FORMAT_BC7_UNORM_SRGB (like NVTT), but ddsview in DirectXTex pays attention to it. So not sure what to do here.
 		// For best compatibility just write DXGI_FORMAT_BC7_UNORM.
 		//hdr10.dxgiFormat = srgb ? DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM;
 		hdr10.dxgiFormat = dxgi_format; // DXGI_FORMAT_BC7_UNORM;
 		hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
 		hdr10.arraySize = 1;

 		fwrite(&hdr10, sizeof(hdr10), 1, pFile);
 	}

 	fwrite(pBlocks, desc.lPitch, 1, pFile);

 	if (fclose(pFile) == EOF)
 	{
 		fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename);
 		return false;
 	}

 	return true;
 }

 void strip_extension(std::string& s)
 {
 	for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--)
 	{
 		if (s[i] == '.')
 		{
 			s.resize(i);
 			break;
 		}
 	}
 }

 void strip_path(std::string& s)
 {
 	for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--)
 	{
 		if ((s[i] == '/') || (s[i] == ':') || (s[i] == '\\'))
 		{
 			s.erase(0, i + 1);
 			break;
 		}
 	}
 }

 uint32_t hash_hsieh(const uint8_t* pBuf, size_t len)
 {
 	if (!pBuf || !len)
 		return 0;

 	uint32_t h = static_cast<uint32_t>(len);

 	const uint32_t bytes_left = len & 3;
 	len >>= 2;

 	while (len--)
 	{
 		const uint16_t* pWords = reinterpret_cast<const uint16_t*>(pBuf);

 		h += pWords[0];

 		const uint32_t t = (pWords[1] << 11) ^ h;
 		h = (h << 16) ^ t;

 		pBuf += sizeof(uint32_t);

 		h += h >> 11;
 	}

 	switch (bytes_left)
 	{
 	case 1:
 		h += *reinterpret_cast<const signed char*>(pBuf);
 		h ^= h << 10;
 		h += h >> 1;
 		break;
 	case 2:
 		h += *reinterpret_cast<const uint16_t*>(pBuf);
 		h ^= h << 11;
 		h += h >> 17;
 		break;
 	case 3:
 		h += *reinterpret_cast<const uint16_t*>(pBuf);
 		h ^= h << 16;
 		h ^= (static_cast<signed char>(pBuf[sizeof(uint16_t)])) << 18;
 		h += h >> 11;
 		break;
 	default:
 		break;
 	}

 	h ^= h << 3;
 	h += h >> 5;
 	h ^= h << 4;
 	h += h >> 17;
 	h ^= h << 25;
 	h += h >> 6;

 	return h;
 }

 float compute_block_max_std_dev(const color_quad_u8* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps)
 {
 	tracked_stat comp_stats[4];

 	for (uint32_t y = 0; y < block_height; y++)
 	{
 		for (uint32_t x = 0; x < block_width; x++)
 		{
 			const color_quad_u8* pPixel = pPixels + x + y * block_width;

 			for (uint32_t c = 0; c < num_comps; c++)
 				comp_stats[c].update(pPixel->m_c[c]);
 		}
 	}

 	float max_std_dev = 0.0f;
 	for (uint32_t i = 0; i < num_comps; i++)
 		max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev());
 	return max_std_dev;
 }

 const uint32_t ASTC_SIG = 0x5CA1AB13;

 #pragma pack(push, 1)
 struct astc_header
 {
 	uint32_t m_sig;
 	uint8_t m_block_x;
 	uint8_t m_block_y;
 	uint8_t m_block_z;
 	uint8_t m_width[3];
 	uint8_t m_height[3];
 	uint8_t m_depth[3];
 };
 #pragma pack(pop)

 bool save_astc_file(const char* pFilename, block16_vec& blocks, uint32_t width, uint32_t height, uint32_t block_width, uint32_t block_height)
 {
 	FILE* pFile = nullptr;

 #ifdef _MSC_VER
 	fopen_s(&pFile, pFilename, "wb");
 #else
 	pFile = fopen(pFilename, "wb");
 #endif

 	if (!pFile)
 		return false;

 	astc_header hdr;
 	memset(&hdr, 0, sizeof(hdr));

 	hdr.m_sig = ASTC_SIG;
 	hdr.m_block_x = (uint8_t)block_width;
 	hdr.m_block_y = (uint8_t)block_height;
 	hdr.m_block_z = 1;
 	hdr.m_width[0] = (uint8_t)(width);
 	hdr.m_width[1] = (uint8_t)(width >> 8);
 	hdr.m_width[2] = (uint8_t)(width >> 16);
 	hdr.m_height[0] = (uint8_t)(height);
 	hdr.m_height[1] = (uint8_t)(height >> 8);
 	hdr.m_height[2] = (uint8_t)(height >> 16);
 	hdr.m_depth[0] = 1;
 	fwrite(&hdr, sizeof(hdr), 1, pFile);

 	fwrite(blocks.data(), 16, blocks.size(), pFile);
 	if (fclose(pFile) == EOF)
 		return false;

 	return true;
 }

 bool load_astc_file(const char* pFilename, block16_vec& blocks, uint32_t& width, uint32_t& height, uint32_t& block_width, uint32_t& block_height)
 {
 	FILE* pFile = nullptr;

 #ifdef _MSC_VER
 	fopen_s(&pFile, pFilename, "rb");
 #else
 	pFile = fopen(pFilename, "rb");
 #endif

 	if (!pFile)
 		return false;

 	astc_header hdr;
 	if (fread(&hdr, sizeof(hdr), 1, pFile) != 1)
 	{
 		fclose(pFile);
 		return false;
 	}

 	if (hdr.m_sig != ASTC_SIG)
 	{
 		fclose(pFile);
 		return false;
 	}

 	width = hdr.m_width[0] + (hdr.m_width[1] << 8) + (hdr.m_width[2] << 16);
 	height = hdr.m_height[0] + (hdr.m_height[1] << 8) + (hdr.m_height[2] << 16);
 	uint32_t depth = hdr.m_depth[0] + (hdr.m_depth[1] << 8) + (hdr.m_depth[2] << 16);

 	if ((width < 1) || (width > 32768) || (height < 1) || (height > 32768))
 		return false;
 	if ((hdr.m_block_z != 1) || (depth != 1))
 		return false;

 	block_width = hdr.m_block_x;
 	block_height = hdr.m_block_y;

 	if ((block_width < 4) || (block_width > 12) || (block_height < 4) || (block_height > 12))
 		return false;

 	uint32_t blocks_x = (width + block_width - 1) / block_width;
 	uint32_t blocks_y = (height + block_height - 1) / block_height;
 	uint32_t total_blocks = blocks_x * blocks_y;

 	blocks.resize(total_blocks);

 	if (fread(blocks.data(), 16, total_blocks, pFile) != total_blocks)
 	{
 		fclose(pFile);
 		return false;
 	}

 	fclose(pFile);
 	return true;
 }

 #if 0
 uint32_t get_deflate_size(const void* pData, size_t data_size)
 {
 	size_t comp_size = 0;
 	void* pPre_RDO_Comp_data = tdefl_compress_mem_to_heap(pData, data_size, &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES);
 	mz_free(pPre_RDO_Comp_data);

 	if (comp_size > UINT32_MAX)
 		return UINT32_MAX;

 	return (uint32_t)comp_size;
 }
 #endif

 bool read_file(const char* pFilename, uint8_vec& buf)
 {
 	buf.resize(0);

 	FILE* pFile = nullptr;
 #if _MSC_VER
 	fopen_s(&pFile, pFilename, "rb");
 #else
 	pFile = fopen(pFilename, "rb");
 #endif
 	if (!pFile)
 		return false;

 	fseek(pFile, 0, SEEK_END);

 	long file_end_ofs = ftell(pFile);
 	if (file_end_ofs <= 0)
 	{
 		fclose(pFile);
 		return false;
 	}

 	size_t sz = static_cast<size_t>(file_end_ofs);
 	if (sz != (unsigned long)file_end_ofs)
 	{
 		fclose(pFile);
 		return false;
 	}

 	fseek(pFile, 0, SEEK_SET);

 	buf.resize(sz);

 	if (fread(buf.data(), sizeof(uint8_t), sz, pFile) != sz)
 	{
 		fclose(pFile);
 		return false;
 	}

 	fclose(pFile);
 	return true;
 }

 } // namespace utils
	// File: utils.cpp
	#include "utils.h"
	//#include "lodepng.h"
	//#include "miniz.h"

	namespace utils
	{

	#define FLOOD_PUSH(y, xl, xr, dy) if (((y + (dy)) >= 0) && ((y + (dy)) < (int)m_height)) { stack.push_back(fill_segment(y, xl, xr, dy)); }

	// See http://www.realtimerendering.com/resources/GraphicsGems/gems/SeedFill.c
	uint32_t image_u8::flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector<pixel_coord>* pSet_pixels)
	{
	uint32_t total_set = 0;

	if (!flood_fill_is_inside(x, y, b))
	return 0;

	std::vector<fill_segment> stack;
	stack.reserve(64);

	FLOOD_PUSH(y, x, x, 1);
	FLOOD_PUSH(y + 1, x, x, -1);

	while (stack.size())
	{
	fill_segment s = stack.back();
	stack.pop_back();

	int x1 = s.m_xl, x2 = s.m_xr, dy = s.m_dy;
	y = s.m_y + s.m_dy;

	for (x = x1; (x >= 0) && flood_fill_is_inside(x, y, b); x--)
	{
	(*this)(x, y) = c;
	total_set++;
	if (pSet_pixels)
	pSet_pixels->push_back(pixel_coord(x, y));
	}

	int l;

	if (x >= x1)
	goto skip;

	l = x + 1;
	if (l < x1)
	FLOOD_PUSH(y, l, x1 - 1, -dy);

	x = x1 + 1;

	do
	{
	for (; x <= ((int)m_width - 1) && flood_fill_is_inside(x, y, b); x++)
	{
	(*this)(x, y) = c;
	total_set++;
	if (pSet_pixels)
	pSet_pixels->push_back(pixel_coord(x, y));
	}
	FLOOD_PUSH(y, l, x - 1, dy);

	if (x > (x2 + 1))
	FLOOD_PUSH(y, x2 + 1, x - 1, -dy);

	skip:
	for (x++; x <= x2 && !flood_fill_is_inside(x, y, b); x++)
	;

	l = x;
	} while (x <= x2);
	}

	return total_set;
	}

	void image_u8::draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color)
	{
	if (xs > xe)
	{
	std::swap(xs, xe);
	std::swap(ys, ye);
	}

	int dx = xe - xs, dy = ye - ys;
	if (!dx)
	{
	if (ys > ye)
	std::swap(ys, ye);
	for (int i = ys; i <= ye; i++)
	set_pixel_clipped(xs, i, color);
	}
	else if (!dy)
	{
	for (int i = xs; i < xe; i++)
	set_pixel_clipped(i, ys, color);
	}
	else if (dy > 0)
	{
	if (dy <= dx)
	{
	int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx);
	rasterize_line(xs, ys, xe, ye, 0, 1, e, e_inc, e_no_inc, color);
	}
	else
	{
	int e = 2 * dx - dy, e_no_inc = 2 * dx, e_inc = 2 * (dx - dy);
	rasterize_line(xs, ys, xe, ye, 1, 1, e, e_inc, e_no_inc, color);
	}
	}
	else
	{
	dy = -dy;
	if (dy <= dx)
	{
	int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx);
	rasterize_line(xs, ys, xe, ye, 0, -1, e, e_inc, e_no_inc, color);
	}
	else
	{
	int e = 2 * dx - dy, e_no_inc = (2 * dx), e_inc = 2 * (dx - dy);
	rasterize_line(xe, ye, xs, ys, 1, -1, e, e_inc, e_no_inc, color);
	}
	}
	}

	void image_u8::rasterize_line(int xs, int ys, int xe, int ye, int pred, int inc_dec, int e, int e_inc, int e_no_inc, const color_quad_u8& color)
	{
	int start, end, var;

	if (pred)
	{
	start = ys;
	end = ye;
	var = xs;
	for (int i = start; i <= end; i++)
	{
	set_pixel_clipped(var, i, color);
	if (e < 0)
	e += e_no_inc;
	else
	{
	var += inc_dec;
	e += e_inc;
	}
	}
	}
	else
	{
	start = xs;
	end = xe;
	var = ys;
	for (int i = start; i <= end; i++)
	{
	set_pixel_clipped(i, var, color);
	if (e < 0)
	e += e_no_inc;
	else
	{
	var += inc_dec;
	e += e_inc;
	}
	}
	}
	}

	#if 0
	bool load_png(const char* pFilename, image_u8& img)
	{
	img.clear();

	std::vector<unsigned char> pixels;
	unsigned int w = 0, h = 0;
	unsigned int e = lodepng::decode(pixels, w, h, pFilename);
	if (e != 0)
	{
	fprintf(stderr, "Failed loading PNG file %s\n", pFilename);
	return false;
	}

	img.init(w, h);
	memcpy(&img.get_pixels()[0], &pixels[0], w * h * sizeof(uint32_t));

	return true;
	}

	bool save_png(const char* pFilename, const image_u8& img, bool save_alpha)
	{
	const uint32_t w = img.width();
	const uint32_t h = img.height();

	std::vector<unsigned char> pixels;
	if (save_alpha)
	{
	pixels.resize(w * h * sizeof(color_quad_u8));
	memcpy(&pixels[0], &img.get_pixels()[0], w * h * sizeof(color_quad_u8));
	}
	else
	{
	pixels.resize(w * h * 3);
	unsigned char* pDst = &pixels[0];
	for (uint32_t y = 0; y < h; y++)
	for (uint32_t x = 0; x < w; x++, pDst += 3)
	pDst[0] = img(x, y)[0], pDst[1] = img(x, y)[1], pDst[2] = img(x, y)[2];
	}

	return lodepng::encode(pFilename, pixels, w, h, save_alpha ? LCT_RGBA : LCT_RGB) == 0;
	}
	#endif

	static float gauss(int x, int y, float sigma_sqr)
	{
	float pow = expf(-((x * x + y * y) / (2.0f * sigma_sqr)));
	float g = (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * pow;
	return g;
	}

	// size_x/y should be odd
	void compute_gaussian_kernel(float* pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags)
	{
	assert(size_x & size_y & 1);

	if (!(size_x \| size_y))
	return;

	int mid_x = size_x / 2;
	int mid_y = size_y / 2;

	double sum = 0;
	for (int x = 0; x < size_x; x++)
	{
	for (int y = 0; y < size_y; y++)
	{
	float g;
	if ((x > mid_x) && (y < mid_y))
	g = pDst[(size_x - x - 1) + y * size_x];
	else if ((x < mid_x) && (y > mid_y))
	g = pDst[x + (size_y - y - 1) * size_x];
	else if ((x > mid_x) && (y > mid_y))
	g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x];
	else
	g = gauss(x - mid_x, y - mid_y, sigma_sqr);

	pDst[x + y * size_x] = g;
	sum += g;
	}
	}

	if (flags & cComputeGaussianFlagNormalizeCenterToOne)
	{
	sum = pDst[mid_x + mid_y * size_x];
	}

	if (flags & (cComputeGaussianFlagNormalizeCenterToOne \| cComputeGaussianFlagNormalize))
	{
	double one_over_sum = 1.0f / sum;
	for (int i = 0; i < size_x * size_y; i++)
	pDst[i] = static_cast<float>(pDst[i] * one_over_sum);

	if (flags & cComputeGaussianFlagNormalizeCenterToOne)
	pDst[mid_x + mid_y * size_x] = 1.0f;
	}

	if (flags & cComputeGaussianFlagPrint)
	{
	printf("{\n");
	for (int y = 0; y < size_y; y++)
	{
	printf(" ");
	for (int x = 0; x < size_x; x++)
	{
	printf("%f, ", pDst[x + y * size_x]);
	}
	printf("\n");
	}
	printf("}");
	}
	}

	void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)
	{
	assert(odd_filter_width && (odd_filter_width & 1));
	odd_filter_width \|= 1;

	std::vector<float> kernel(odd_filter_width * odd_filter_width);
	compute_gaussian_kernel(&kernel[0], odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize);

	const int dst_width = orig_img.get_width() / width_divisor;
	const int dst_height = orig_img.get_height() / height_divisor;

	const int H = odd_filter_width / 2;
	const int L = -H;

	dst.crop(dst_width, dst_height);

	//#pragma omp parallel for
	for (int oy = 0; oy < dst_height; oy++)
	{
	for (int ox = 0; ox < dst_width; ox++)
	{
	vec4F c(0.0f);

	for (int yd = L; yd <= H; yd++)
	{
	int y = oy * height_divisor + (height_divisor >> 1) + yd;

	for (int xd = L; xd <= H; xd++)
	{
	int x = ox * width_divisor + (width_divisor >> 1) + xd;

	const vec4F& p = orig_img.get_clamped_or_wrapped(x, y, wrapping, wrapping);

	float w = kernel[(xd + H) + (yd + H) * odd_filter_width];
	c[0] += p[0] * w;
	c[1] += p[1] * w;
	c[2] += p[2] * w;
	c[3] += p[3] * w;
	}
	}

	dst(ox, oy).set(c[0], c[1], c[2], c[3]);
	}
	}
	}

	static void pow_image(const imagef& src, imagef& dst, const vec4F& power)
	{
	dst.resize(src);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& p = src(x, y);

	if ((power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f))
	dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]);
	else
	dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3]));
	}
	}
	}

	#if 0
	static void mul_image(const imagef& src, imagef& dst, const vec4F& mul)
	{
	dst.resize(src);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& p = src(x, y);
	dst(x, y).set(p[0] * mul[0], p[1] * mul[1], p[2] * mul[2], p[3] * mul[3]);
	}
	}
	}
	#endif

	static void scale_image(const imagef& src, imagef& dst, const vec4F& scale, const vec4F& shift)
	{
	dst.resize(src);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& p = src(x, y);

	vec4F d;

	for (uint32_t c = 0; c < 4; c++)
	d[c] = scale[c] * p[c] + shift[c];

	dst(x, y).set(d[0], d[1], d[2], d[3]);
	}
	}
	}

	static void add_weighted_image(const imagef& src1, const vec4F& alpha, const imagef& src2, const vec4F& beta, const vec4F& gamma, imagef& dst)
	{
	dst.resize(src1);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& s1 = src1(x, y);
	const vec4F& s2 = src2(x, y);

	dst(x, y).set(
	s1[0] * alpha[0] + s2[0] * beta[0] + gamma[0],
	s1[1] * alpha[1] + s2[1] * beta[1] + gamma[1],
	s1[2] * alpha[2] + s2[2] * beta[2] + gamma[2],
	s1[3] * alpha[3] + s2[3] * beta[3] + gamma[3]);
	}
	}
	}

	static void add_image(const imagef& src1, const imagef& src2, imagef& dst)
	{
	dst.resize(src1);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& s1 = src1(x, y);
	const vec4F& s2 = src2(x, y);

	dst(x, y).set(s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2], s1[3] + s2[3]);
	}
	}
	}

	static void adds_image(const imagef& src, const vec4F& value, imagef& dst)
	{
	dst.resize(src);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& p = src(x, y);

	dst(x, y).set(p[0] + value[0], p[1] + value[1], p[2] + value[2], p[3] + value[3]);
	}
	}
	}

	static void mul_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale)
	{
	dst.resize(src1);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& s1 = src1(x, y);
	const vec4F& s2 = src2(x, y);

	vec4F d;

	for (uint32_t c = 0; c < 4; c++)
	{
	float v1 = s1[c];
	float v2 = s2[c];
	d[c] = v1 * v2 * scale[c];
	}

	dst(x, y) = d;
	}
	}
	}

	static void div_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale)
	{
	dst.resize(src1);

	//#pragma omp parallel for
	for (int y = 0; y < (int)dst.get_height(); y++)
	{
	for (uint32_t x = 0; x < dst.get_width(); x++)
	{
	const vec4F& s1 = src1(x, y);
	const vec4F& s2 = src2(x, y);

	vec4F d;

	for (uint32_t c = 0; c < 4; c++)
	{
	float v = s2[c];
	if (v == 0.0f)
	d[c] = 0.0f;
	else
	d[c] = (s1[c] * scale[c]) / v;
	}

	dst(x, y) = d;
	}
	}
	}

	static vec4F avg_image(const imagef& src)
	{
	vec4F avg(0.0f);

	for (uint32_t y = 0; y < src.get_height(); y++)
	{
	for (uint32_t x = 0; x < src.get_width(); x++)
	{
	const vec4F& s = src(x, y);

	avg += vec4F(s[0], s[1], s[2], s[3]);
	}
	}

	avg /= static_cast<float>(src.get_total_pixels());

	return avg;
	}

	// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html
	vec4F compute_ssim(const imagef& a, const imagef& b)
	{
	imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3;

	const float C1 = 6.50250f, C2 = 58.52250f;

	pow_image(a, a_sq, vec4F(2));
	pow_image(b, b_sq, vec4F(2));
	mul_image(a, b, axb, vec4F(1.0f));

	gaussian_filter(mu1, a, 11, 1.5f * 1.5f);
	gaussian_filter(mu2, b, 11, 1.5f * 1.5f);

	pow_image(mu1, mu1_sq, vec4F(2));
	pow_image(mu2, mu2_sq, vec4F(2));
	mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f));

	gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f);
	add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq);

	gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f);
	add_weighted_image(s2_sq, vec4F(1), mu2_sq, vec4F(-1), vec4F(0), s2_sq);

	gaussian_filter(s12, axb, 11, 1.5f * 1.5f);
	add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12);

	scale_image(mu1_mu2, t1, vec4F(2), vec4F(0));
	adds_image(t1, vec4F(C1), t1);

	scale_image(s12, t2, vec4F(2), vec4F(0));
	adds_image(t2, vec4F(C2), t2);

	mul_image(t1, t2, t3, vec4F(1));

	add_image(mu1_sq, mu2_sq, t1);
	adds_image(t1, vec4F(C1), t1);

	add_image(s1_sq, s2_sq, t2);
	adds_image(t2, vec4F(C2), t2);

	mul_image(t1, t2, t1, vec4F(1));

	div_image(t3, t1, smap, vec4F(1));

	return avg_image(smap);
	}

	vec4F compute_ssim(const image_u8& a, const image_u8& b, bool luma)
	{
	image_u8 ta(a), tb(b);

	if ((ta.width() != tb.width()) \|\| (ta.height() != tb.height()))
	{
	fprintf(stderr, "compute_ssim: Cropping input images to equal dimensions\n");

	const uint32_t w = std::min(a.width(), b.width());
	const uint32_t h = std::min(a.height(), b.height());
	ta.crop(w, h);
	tb.crop(w, h);
	}

	if (!ta.width() \|\| !ta.height())
	{
	assert(0);
	return vec4F(0);
	}

	if (luma)
	{
	for (uint32_t y = 0; y < ta.height(); y++)
	{
	for (uint32_t x = 0; x < ta.width(); x++)
	{
	ta(x, y).set((uint8_t)ta(x, y).get_luma(), ta(x, y).a);
	tb(x, y).set((uint8_t)tb(x, y).get_luma(), tb(x, y).a);
	}
	}
	}

	imagef fta, ftb;

	fta.set(ta);
	ftb.set(tb);

	return compute_ssim(fta, ftb);
	}

	bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header)
	{
	(void)srgb;

	FILE* pFile = NULL;
	#ifdef _MSC_VER
	fopen_s(&pFile, pFilename, "wb");
	#else
	pFile = fopen(pFilename, "wb");
	#endif
	if (!pFile)
	{
	fprintf(stderr, "Failed creating file %s!\n", pFilename);
	return false;
	}

	fwrite("DDS ", 4, 1, pFile);

	DDSURFACEDESC2 desc;
	memset(&desc, 0, sizeof(desc));

	desc.dwSize = sizeof(desc);
	desc.dwFlags = DDSD_WIDTH \| DDSD_HEIGHT \| DDSD_PIXELFORMAT \| DDSD_CAPS;

	desc.dwWidth = width;
	desc.dwHeight = height;

	desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE;
	desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat);

	desc.ddpfPixelFormat.dwFlags \|= DDPF_FOURCC;

	desc.lPitch = (((desc.dwWidth + 3) & ~3) * ((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3;
	desc.dwFlags \|= DDSD_LINEARSIZE;

	desc.ddpfPixelFormat.dwRGBBitCount = 0;

	if ((!force_dx10_header) &&
	((dxgi_format == DXGI_FORMAT_BC1_UNORM) \|\|
	(dxgi_format == DXGI_FORMAT_BC3_UNORM) \|\|
	(dxgi_format == DXGI_FORMAT_BC4_UNORM) \|\|
	(dxgi_format == DXGI_FORMAT_BC5_UNORM)))
	{
	if (dxgi_format == DXGI_FORMAT_BC1_UNORM)
	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '1');
	else if (dxgi_format == DXGI_FORMAT_BC3_UNORM)
	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '5');
	else if (dxgi_format == DXGI_FORMAT_BC4_UNORM)
	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '1');
	else if (dxgi_format == DXGI_FORMAT_BC5_UNORM)
	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '2');

	fwrite(&desc, sizeof(desc), 1, pFile);
	}
	else
	{
	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0');

	fwrite(&desc, sizeof(desc), 1, pFile);

	DDS_HEADER_DXT10 hdr10;
	memset(&hdr10, 0, sizeof(hdr10));

	// Not all tools support DXGI_FORMAT_BC7_UNORM_SRGB (like NVTT), but ddsview in DirectXTex pays attention to it. So not sure what to do here.
	// For best compatibility just write DXGI_FORMAT_BC7_UNORM.
	//hdr10.dxgiFormat = srgb ? DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM;
	hdr10.dxgiFormat = dxgi_format; // DXGI_FORMAT_BC7_UNORM;
	hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
	hdr10.arraySize = 1;

	fwrite(&hdr10, sizeof(hdr10), 1, pFile);
	}

	fwrite(pBlocks, desc.lPitch, 1, pFile);

	if (fclose(pFile) == EOF)
	{
	fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename);
	return false;
	}

	return true;
	}

	void strip_extension(std::string& s)
	{
	for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--)
	{
	if (s[i] == '.')
	{
	s.resize(i);
	break;
	}
	}
	}

	void strip_path(std::string& s)
	{
	for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--)
	{
	if ((s[i] == '/') \|\| (s[i] == ':') \|\| (s[i] == '\\'))
	{
	s.erase(0, i + 1);
	break;
	}
	}
	}

	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len)
	{
	if (!pBuf \|\| !len)
	return 0;

	uint32_t h = static_cast<uint32_t>(len);

	const uint32_t bytes_left = len & 3;
	len >>= 2;

	while (len--)
	{
	const uint16_t* pWords = reinterpret_cast<const uint16_t*>(pBuf);

	h += pWords[0];

	const uint32_t t = (pWords[1] << 11) ^ h;
	h = (h << 16) ^ t;

	pBuf += sizeof(uint32_t);

	h += h >> 11;
	}

	switch (bytes_left)
	{
	case 1:
	h += reinterpret_cast<const signed char>(pBuf);
	h ^= h << 10;
	h += h >> 1;
	break;
	case 2:
	h += reinterpret_cast<const uint16_t>(pBuf);
	h ^= h << 11;
	h += h >> 17;
	break;
	case 3:
	h += reinterpret_cast<const uint16_t>(pBuf);
	h ^= h << 16;
	h ^= (static_cast<signed char>(pBuf[sizeof(uint16_t)])) << 18;
	h += h >> 11;
	break;
	default:
	break;
	}

	h ^= h << 3;
	h += h >> 5;
	h ^= h << 4;
	h += h >> 17;
	h ^= h << 25;
	h += h >> 6;

	return h;
	}

	float compute_block_max_std_dev(const color_quad_u8* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps)
	{
	tracked_stat comp_stats[4];

	for (uint32_t y = 0; y < block_height; y++)
	{
	for (uint32_t x = 0; x < block_width; x++)
	{
	const color_quad_u8* pPixel = pPixels + x + y * block_width;

	for (uint32_t c = 0; c < num_comps; c++)
	comp_stats[c].update(pPixel->m_c[c]);
	}
	}

	float max_std_dev = 0.0f;
	for (uint32_t i = 0; i < num_comps; i++)
	max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev());
	return max_std_dev;
	}

	const uint32_t ASTC_SIG = 0x5CA1AB13;

	#pragma pack(push, 1)
	struct astc_header
	{
	uint32_t m_sig;
	uint8_t m_block_x;
	uint8_t m_block_y;
	uint8_t m_block_z;
	uint8_t m_width[3];
	uint8_t m_height[3];
	uint8_t m_depth[3];
	};
	#pragma pack(pop)

	bool save_astc_file(const char* pFilename, block16_vec& blocks, uint32_t width, uint32_t height, uint32_t block_width, uint32_t block_height)
	{
	FILE* pFile = nullptr;

	#ifdef _MSC_VER
	fopen_s(&pFile, pFilename, "wb");
	#else
	pFile = fopen(pFilename, "wb");
	#endif

	if (!pFile)
	return false;

	astc_header hdr;
	memset(&hdr, 0, sizeof(hdr));

	hdr.m_sig = ASTC_SIG;
	hdr.m_block_x = (uint8_t)block_width;
	hdr.m_block_y = (uint8_t)block_height;
	hdr.m_block_z = 1;
	hdr.m_width[0] = (uint8_t)(width);
	hdr.m_width[1] = (uint8_t)(width >> 8);
	hdr.m_width[2] = (uint8_t)(width >> 16);
	hdr.m_height[0] = (uint8_t)(height);
	hdr.m_height[1] = (uint8_t)(height >> 8);
	hdr.m_height[2] = (uint8_t)(height >> 16);
	hdr.m_depth[0] = 1;
	fwrite(&hdr, sizeof(hdr), 1, pFile);

	fwrite(blocks.data(), 16, blocks.size(), pFile);
	if (fclose(pFile) == EOF)
	return false;

	return true;
	}

	bool load_astc_file(const char* pFilename, block16_vec& blocks, uint32_t& width, uint32_t& height, uint32_t& block_width, uint32_t& block_height)
	{
	FILE* pFile = nullptr;

	#ifdef _MSC_VER
	fopen_s(&pFile, pFilename, "rb");
	#else
	pFile = fopen(pFilename, "rb");
	#endif

	if (!pFile)
	return false;

	astc_header hdr;
	if (fread(&hdr, sizeof(hdr), 1, pFile) != 1)
	{
	fclose(pFile);
	return false;
	}

	if (hdr.m_sig != ASTC_SIG)
	{
	fclose(pFile);
	return false;
	}

	width = hdr.m_width[0] + (hdr.m_width[1] << 8) + (hdr.m_width[2] << 16);
	height = hdr.m_height[0] + (hdr.m_height[1] << 8) + (hdr.m_height[2] << 16);
	uint32_t depth = hdr.m_depth[0] + (hdr.m_depth[1] << 8) + (hdr.m_depth[2] << 16);

	if ((width < 1) \|\| (width > 32768) \|\| (height < 1) \|\| (height > 32768))
	return false;
	if ((hdr.m_block_z != 1) \|\| (depth != 1))
	return false;

	block_width = hdr.m_block_x;
	block_height = hdr.m_block_y;

	if ((block_width < 4) \|\| (block_width > 12) \|\| (block_height < 4) \|\| (block_height > 12))
	return false;

	uint32_t blocks_x = (width + block_width - 1) / block_width;
	uint32_t blocks_y = (height + block_height - 1) / block_height;
	uint32_t total_blocks = blocks_x * blocks_y;

	blocks.resize(total_blocks);

	if (fread(blocks.data(), 16, total_blocks, pFile) != total_blocks)
	{
	fclose(pFile);
	return false;
	}

	fclose(pFile);
	return true;
	}

	#if 0
	uint32_t get_deflate_size(const void* pData, size_t data_size)
	{
	size_t comp_size = 0;
	void* pPre_RDO_Comp_data = tdefl_compress_mem_to_heap(pData, data_size, &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES);
	mz_free(pPre_RDO_Comp_data);

	if (comp_size > UINT32_MAX)
	return UINT32_MAX;

	return (uint32_t)comp_size;
	}
	#endif

	bool read_file(const char* pFilename, uint8_vec& buf)
	{
	buf.resize(0);

	FILE* pFile = nullptr;
	#if _MSC_VER
	fopen_s(&pFile, pFilename, "rb");
	#else
	pFile = fopen(pFilename, "rb");
	#endif
	if (!pFile)
	return false;

	fseek(pFile, 0, SEEK_END);

	long file_end_ofs = ftell(pFile);
	if (file_end_ofs <= 0)
	{
	fclose(pFile);
	return false;
	}

	size_t sz = static_cast<size_t>(file_end_ofs);
	if (sz != (unsigned long)file_end_ofs)
	{
	fclose(pFile);
	return false;
	}

	fseek(pFile, 0, SEEK_SET);

	buf.resize(sz);

	if (fread(buf.data(), sizeof(uint8_t), sz, pFile) != sz)
	{
	fclose(pFile);
	return false;
	}

	fclose(pFile);
	return true;
	}

	} // namespace utils