Adding TGA reading support
diff --git a/basisu_enc.cpp b/basisu_enc.cpp
index 72ae1ea..e86bc27 100644
--- a/basisu_enc.cpp
+++ b/basisu_enc.cpp
@@ -229,6 +229,58 @@
 
 		return true;
 	}
+
+	// Loads the .TGA file at pFilename into img as 8-bit RGBA pixels.
+	// 3-channel sources get a fully opaque (255) alpha; 4-channel sources keep their own alpha.
+	// Prints an error and returns false if decoding fails or (32-bit builds only) the image is too large.
+	bool load_tga(const char* pFilename, image& img)
+	{
+		int width = 0, height = 0, num_comps = 0;
+		uint8_t* pPixels = read_tga(pFilename, width, height, num_comps);
+
+		// read_tga() allocates with malloc(); this function owns (and must free) pPixels from here on.
+		const bool decoded_ok = (pPixels != nullptr) && (width != 0) && (height != 0) && ((num_comps == 3) || (num_comps == 4));
+		if (!decoded_ok)
+		{
+			error_printf("Failed loading .TGA image \"%s\"!\n", pFilename);
+
+			free(pPixels); // free(nullptr) is a no-op
+			return false;
+		}
+
+		// Only builds whose pointers are 32 bits wide cap the decoded image size.
+		if ((sizeof(void *) == sizeof(uint32_t)) && ((width * height * num_comps) > MAX_32BIT_ALLOC_SIZE))
+		{
+			error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, width, height);
+			free(pPixels);
+			return false;
+		}
+
+		img.resize(width, height);
+
+		// Expand the tightly packed source texels into img's RGBA pixels, one row at a time.
+		const bool has_alpha = (num_comps != 3);
+		const uint8_t *pTexel = pPixels;
+		for (int row = 0; row < height; row++)
+		{
+			color_rgba *pOut = &img(0, row);
+
+			for (int col = 0; col < width; col++, pTexel += num_comps, ++pOut)
+			{
+				pOut->r = pTexel[0];
+				pOut->g = pTexel[1];
+				pOut->b = pTexel[2];
+				if (has_alpha)
+					pOut->a = pTexel[3];
+				else
+					pOut->a = 255;
+			}
+		}
+
+		free(pPixels);
+
+		return true;
+	}
 		
 	bool load_png(const char* pFilename, image& img)
 	{
@@ -287,6 +339,8 @@
 			return load_png(pFilename, img);
 		if (strcasecmp(pExt, "bmp") == 0)
 			return load_bmp(pFilename, img);
+		if (strcasecmp(pExt, "tga") == 0)
+			return load_tga(pFilename, img);
 
 		return false;
 	}
@@ -1455,4 +1509,386 @@
 		debug_printf("job_pool::job_thread: exiting\n");
 	}
 
+	// .TGA image loading
+	#pragma pack(push)
+	#pragma pack(1)
+	struct tga_header // Fixed header at the very start of a .TGA file; packed to 1-byte alignment because read_tga() overlays it directly on the file bytes.
+	{
+		uint8_t			m_id_len;		// Number of image-ID bytes following the header (read_tga() skips them)
+		uint8_t			m_cmap;			// Colour map flag; read_tga() accepts only 0 (absent) or 1 (present)
+		uint8_t			m_type;			// File image type: 1 = palettized, 2 = RGB, 3 = grayscale; values above 8 are treated as RLE variants (type - 8)
+		packed_uint<2>	m_cmap_first;	// First colour map index; read_tga() rejects anything but 0 when a colour map is present
+		packed_uint<2> m_cmap_len;		// Number of colour map entries
+		uint8_t			m_cmap_bpp;		// Bits per colour map entry
+		packed_uint<2> m_x_org;			// Not read by read_tga(); presumably the image X origin - TODO confirm against the TGA spec
+		packed_uint<2> m_y_org;			// Not read by read_tga(); presumably the image Y origin - TODO confirm against the TGA spec
+		packed_uint<2> m_width;			// Image width in pixels
+		packed_uint<2> m_height;		// Image height in pixels
+		uint8_t			m_depth;		// Bits per pixel; read_tga() handles 8, 15, 16, 24 and 32
+		uint8_t			m_desc;			// Descriptor: bit 0x10 set = rows stored right-to-left, bit 0x20 clear = rows stored bottom-to-top; read_tga() rejects files with either of the top two bits set
+	};
+	#pragma pack(pop)
+
+	const uint32_t MAX_TGA_IMAGE_SIZE = 16384; // Largest width or height (in pixels) that read_tga() accepts
+	// Internal pixel-source classification used by read_tga(). These are NOT the file's type codes: read_tga() maps file type 1 to cITPalettized, 2 to cITRGB and 3 to cITGrayscale.
+	enum tga_image_type
+	{
+		cITPalettized = 0, // 8-bit indices looked up in the colour map
+		cITRGB = 1, // direct colour (15/16/24/32 bits per pixel)
+		cITGrayscale = 2 // 8-bit luminance replicated to R, G and B
+	};
+
+	// Decodes a .TGA image held in memory (pBuf, buf_size bytes) into a tightly packed 8-bit-per-channel RGB or RGBA buffer.
+	// On success returns a malloc()'d buffer the caller must free(), and sets width, height and n_chans (3 = RGB, 4 = RGBA).
+	// Returns nullptr if the header is unsupported/invalid or the data is truncated; the outputs may already be non-zero then.
+	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans)
+	{
+		width = height = n_chans = 0;
+
+		if (buf_size <= sizeof(tga_header))
+			return nullptr;
+
+		const tga_header &hdr = *reinterpret_cast<const tga_header *>(pBuf);
+
+		if ((!hdr.m_width) || (!hdr.m_height) || (hdr.m_width > MAX_TGA_IMAGE_SIZE) || (hdr.m_height > MAX_TGA_IMAGE_SIZE))
+			return nullptr;
+
+		if (hdr.m_desc >> 6)
+			return nullptr;
+
+		// Simple validation of the colour map fields
+		if ((hdr.m_cmap != 0) && (hdr.m_cmap != 1))
+			return nullptr;
+		if (hdr.m_cmap)
+		{
+			// We don't support 32-bit palettized RGBA .TGA files (we have nothing to test with!).
+			if ((hdr.m_cmap_bpp == 0) || (hdr.m_cmap_bpp > 24))
+				return nullptr;
+
+			// Nobody implements CMapFirst correctly, so we're not supporting it. Never seen it used, either.
+			if (hdr.m_cmap_first != 0)
+				return nullptr;
+		}
+
+		const bool x_flipped = (hdr.m_desc & 0x10) != 0; // bit set: pixels within a row are stored right-to-left
+		const bool y_flipped = (hdr.m_desc & 0x20) == 0; // bit clear: rows are stored bottom-to-top, so they are written out in reverse order
+
+		bool rle_flag = false;
+		int file_image_type = hdr.m_type;
+		if (file_image_type > 8) // types above 8 are the run-length-encoded variants of the base types
+		{
+			file_image_type -= 8;
+			rle_flag = true;
+		}
+
+		tga_image_type image_type;
+
+		switch (file_image_type)
+		{
+		case 2:
+			if (hdr.m_depth == 8)
+				return nullptr;
+			image_type = cITRGB;
+			break;
+		case 1:
+			if ((hdr.m_depth != 8) || (hdr.m_cmap != 1) || (hdr.m_cmap_len == 0))
+				return nullptr;
+			image_type = cITPalettized;
+			break;
+		case 3:
+			if ((hdr.m_depth != 8) || (hdr.m_cmap != 0) || (hdr.m_cmap_len != 0))
+				return nullptr;
+			image_type = cITGrayscale;
+			break;
+		default:
+			return nullptr;
+		}
+
+		uint32_t bytes_per_pixel = 0;
+
+		switch (hdr.m_depth)
+		{
+		case 32:
+			bytes_per_pixel = 4;
+			n_chans = 4;
+			break;
+		case 24:
+			bytes_per_pixel = 3;
+			n_chans = 3;
+			break;
+		case 16:
+		case 15:
+			bytes_per_pixel = 2;
+			n_chans = 3;
+			break;
+		case 8:
+			bytes_per_pixel = 1;
+			n_chans = 3;
+			break;
+		default:
+			return nullptr;
+		}
+
+		// Everything after the fixed header: optional image ID, optional colour map, then the pixel data.
+		const uint8_t *pSrc = pBuf + sizeof(tga_header);
+		uint32_t bytes_remaining = buf_size - sizeof(tga_header);
+
+		// Skip the image ID field, if any.
+		if (hdr.m_id_len)
+		{
+			if (bytes_remaining < hdr.m_id_len)
+				return nullptr;
+			pSrc += hdr.m_id_len;
+			bytes_remaining -= hdr.m_id_len; // must shrink in step with pSrc so every later bounds check stays valid
+		}
+
+		color_rgba pal[256];
+		for (uint32_t i = 0; i < 256; i++)
+			pal[i].set(0, 0, 0, 255);
+
+		if ((hdr.m_cmap) && (hdr.m_cmap_len))
+		{
+			if (image_type == cITPalettized)
+			{
+				// I cannot find any files using 32bpp palettes in the wild (never seen any in ~30 years).
+				if ( ((hdr.m_cmap_bpp != 24) && (hdr.m_cmap_bpp != 15) && (hdr.m_cmap_bpp != 16)) || (hdr.m_cmap_len > 256) )
+					return nullptr;
+
+				if (hdr.m_cmap_bpp == 24)
+				{
+					const uint32_t pal_size = hdr.m_cmap_len * 3;
+					if (bytes_remaining < pal_size)
+						return nullptr;
+
+					for (uint32_t i = 0; i < hdr.m_cmap_len; i++)
+					{
+						pal[i].r = pSrc[i * 3 + 2];
+						pal[i].g = pSrc[i * 3 + 1];
+						pal[i].b = pSrc[i * 3 + 0];
+						pal[i].a = 255;
+					}
+
+					bytes_remaining -= pal_size;
+					pSrc += pal_size;
+				}
+				else
+				{
+					const uint32_t pal_size = hdr.m_cmap_len * 2;
+					if (bytes_remaining < pal_size)
+						return nullptr;
+
+					for (uint32_t i = 0; i < hdr.m_cmap_len; i++)
+					{
+						const uint32_t v = pSrc[i * 2 + 0] | (pSrc[i * 2 + 1] << 8);
+
+						pal[i].r = (((v >> 10) & 31) * 255 + 15) / 31;
+						pal[i].g = (((v >> 5) & 31) * 255 + 15) / 31;
+						pal[i].b = ((v & 31) * 255 + 15) / 31;
+						pal[i].a = 255;
+					}
+
+					bytes_remaining -= pal_size;
+					pSrc += pal_size;
+				}
+			}
+			else // a colour map is present but this image type does not use it: step over it
+			{
+				const uint32_t bytes_to_skip = ((hdr.m_cmap_bpp + 7) >> 3) * hdr.m_cmap_len; // round up to whole bytes: 15-bit entries occupy 2 bytes, as in the palettized path above
+				if (bytes_remaining < bytes_to_skip)
+					return nullptr;
+				pSrc += bytes_to_skip;
+				bytes_remaining -= bytes_to_skip; // must shrink in step with pSrc so every later bounds check stays valid
+			}
+		}
+		else if (image_type == cITPalettized)
+		{
+			for (uint32_t i = 0; i < 256; i++)
+				pal[i].set(i, i, i, 255);
+		}
+
+		width = hdr.m_width;
+		height = hdr.m_height;
+
+		const uint32_t source_pitch = width * bytes_per_pixel;
+		const uint32_t dest_pitch = width * n_chans;
+
+		uint8_t *pImage = (uint8_t *)malloc(dest_pitch * height);
+		if (!pImage)
+			return nullptr;
+
+		std::vector<uint8_t> input_line_buf;
+		if (rle_flag)
+			input_line_buf.resize(source_pitch);
+		// RLE packet state is kept outside the row loop because a packet may carry over into the next scanline.
+		int run_type = 0, run_remaining = 0;
+		uint8_t run_pixel[4];
+		memset(run_pixel, 0, sizeof(run_pixel));
+
+		for (int y = 0; y < height; y++)
+		{
+			const uint8_t *pLine_data;
+
+			if (rle_flag)
+			{
+				int pixels_remaining = width;
+				uint8_t *pDst = &input_line_buf[0];
+
+				do
+				{
+					if (!run_remaining) // previous packet exhausted: read the next packet header byte
+					{
+						if (bytes_remaining < 1)
+						{
+							free(pImage);
+							return nullptr;
+						}
+
+						int v = *pSrc++;
+						bytes_remaining--;
+
+						run_type = v & 0x80;
+						run_remaining = (v & 0x7F) + 1;
+
+						if (run_type) // run packet: a single pixel value follows and is repeated
+						{
+							if (bytes_remaining < bytes_per_pixel)
+							{
+								free(pImage);
+								return nullptr;
+							}
+
+							memcpy(run_pixel, pSrc, bytes_per_pixel);
+							pSrc += bytes_per_pixel;
+							bytes_remaining -= bytes_per_pixel;
+						}
+					}
+
+					const uint32_t n = std::min<uint32_t>(pixels_remaining, run_remaining);
+					pixels_remaining -= n;
+					run_remaining -= n;
+
+					if (run_type)
+					{
+						for (uint32_t i = 0; i < n; i++)
+							for (uint32_t j = 0; j < bytes_per_pixel; j++)
+								*pDst++ = run_pixel[j];
+					}
+					else // raw packet: n literal pixels are copied straight from the stream
+					{
+						const uint32_t bytes_wanted = n * bytes_per_pixel;
+
+						if (bytes_remaining < bytes_wanted)
+						{
+							free(pImage);
+							return nullptr;
+						}
+
+						memcpy(pDst, pSrc, bytes_wanted);
+						pDst += bytes_wanted;
+
+						pSrc += bytes_wanted;
+						bytes_remaining -= bytes_wanted;
+					}
+
+				} while (pixels_remaining);
+
+				assert((pDst - &input_line_buf[0]) == width * bytes_per_pixel);
+
+				pLine_data = &input_line_buf[0];
+			}
+			else
+			{
+				if (bytes_remaining < source_pitch)
+				{
+					free(pImage);
+					return nullptr;
+				}
+
+				pLine_data = pSrc;
+				bytes_remaining -= source_pitch;
+				pSrc += source_pitch;
+			}
+
+			// Convert to 24bpp RGB or 32bpp RGBA.
+			uint8_t *pDst = pImage + (y_flipped ? (height - 1 - y) : y) * dest_pitch + (x_flipped ? (width - 1) * n_chans : 0);
+			const int dst_stride = x_flipped ? -((int)n_chans) : n_chans;
+
+			switch (hdr.m_depth)
+			{
+			case 32:
+				assert(n_chans == 4);
+				for (int i = 0; i < width; i++, pLine_data += 4, pDst += dst_stride)
+				{
+					pDst[0] = pLine_data[2];
+					pDst[1] = pLine_data[1];
+					pDst[2] = pLine_data[0];
+					pDst[3] = pLine_data[3];
+				}
+				break;
+			case 24:
+				assert(n_chans == 3);
+				for (int i = 0; i < width; i++, pLine_data += 3, pDst += dst_stride)
+				{
+					pDst[0] = pLine_data[2];
+					pDst[1] = pLine_data[1];
+					pDst[2] = pLine_data[0];
+				}
+				break;
+			case 16:
+			case 15:
+				assert(n_chans == 3);
+				for (int i = 0; i < width; i++, pLine_data += 2, pDst += dst_stride)
+				{
+					const uint32_t v = pLine_data[0] | (pLine_data[1] << 8);
+					pDst[0] = (((v >> 10) & 31) * 255 + 15) / 31;
+					pDst[1] = (((v >> 5) & 31) * 255 + 15) / 31;
+					pDst[2] = ((v & 31) * 255 + 15) / 31;
+				}
+				break;
+			case 8:
+				assert(n_chans == 3);
+				if (image_type == cITPalettized)
+				{
+					for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride)
+					{
+						const uint32_t c = *pLine_data;
+						pDst[0] = pal[c].r;
+						pDst[1] = pal[c].g;
+						pDst[2] = pal[c].b;
+					}
+				}
+				else
+				{
+					for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride)
+					{
+						const uint8_t c = *pLine_data;
+						pDst[0] = c;
+						pDst[1] = c;
+						pDst[2] = c;
+					}
+				}
+				break;
+			default:
+				assert(0);
+				break;
+			}
+		} // y
+
+		return pImage;
+	}
+
+	// File overload: reads pFilename fully into memory and decodes it with the buffer-based read_tga(); returns nullptr if the file can't be read, is empty, or exceeds UINT32_MAX bytes.
+	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans)
+	{
+		width = 0;
+		height = 0;
+		n_chans = 0;
+
+		uint8_vec file_bytes;
+		if ((!read_file_to_vec(pFilename, file_bytes)) || (!file_bytes.size()) || (file_bytes.size() > UINT32_MAX))
+			return nullptr;
+
+		return read_tga(&file_bytes[0], static_cast<uint32_t>(file_bytes.size()), width, height, n_chans);
+	}
+
 } // namespace basisu
diff --git a/basisu_enc.h b/basisu_enc.h
index 146111b..43b0728 100644
--- a/basisu_enc.h
+++ b/basisu_enc.h
@@ -27,6 +27,8 @@
 #include <libgen.h>
 #endif
 
+// This module is really just a huge grab bag of classes and helper functions needed by the encoder.
+
 namespace basisu
 {
 	extern uint8_t g_hamming_dist[256];
@@ -2798,11 +2800,20 @@
 		
 	bool load_png(const char* pFilename, image& img);
 	inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }
+
+	bool load_bmp(const char* pFilename, image& img); // load_image() calls this for files with a "bmp" extension
+	inline bool load_bmp(const std::string &filename, image &img) { return load_bmp(filename.c_str(), img); } // std::string convenience overload
+	// Loads a .TGA file into img as RGBA (alpha is 255 for 3-channel sources); prints an error and returns false on failure.
+	bool load_tga(const char* pFilename, image& img);
+	inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); } // std::string convenience overload
 	
-	// Currently loads .BMP or .PNG.
+	// Currently loads .BMP, .PNG, or .TGA.
 	bool load_image(const char* pFilename, image& img);
 	inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }
 
+	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans); // Decodes an in-memory .TGA; returns a malloc()'d RGB/RGBA buffer (caller must free()) or nullptr on failure
+	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans); // Same, but loads the file at pFilename into memory first
+
 	enum
 	{
 		cImageSaveGrayscale = 1,
@@ -2964,7 +2975,7 @@
 	}
 
 	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);
-
+		
 } // namespace basisu
 
 
diff --git a/basisu_tool.cpp b/basisu_tool.cpp
index 44eb154..5e36d0f 100644
--- a/basisu_tool.cpp
+++ b/basisu_tool.cpp
@@ -52,11 +52,11 @@
 	printf("\nUsage: basisu filename [filename ...] <options>\n");
 	
 	puts("\n"
-		"The default mode is compression of one or more PNG/BMP files to a .basis file. Alternate modes:\n"
+		"The default mode is compression of one or more PNG/BMP/TGA files to a .basis file. Alternate modes:\n"
 		" -unpack: Use transcoder to unpack .basis file to one or more .ktx/.png files\n"
 		" -validate: Validate and display information about a .basis file\n"
 		" -info: Display high-level information about a .basis file\n"
-		" -compare: Compare two PNG/BMP images specified with -file, output PSNR and SSIM statistics and RGB/A delta images\n"
+		" -compare: Compare two PNG/BMP/TGA images specified with -file, output PSNR and SSIM statistics and RGB/A delta images\n"
 		" -version: Print basisu version and exit\n"
 		"Unless an explicit mode is specified, if one or more files have the .basis extension this tool defaults to unpack mode.\n"
 		"\n"
@@ -66,8 +66,8 @@
 		"Filenames prefixed with a @ symbol are read as filename listing files. Listing text files specify which actual filenames to process (one filename per line).\n"
 		"\n"
 		"Options:\n"
-		" -file filename.png/bmp: Input image filename, multiple images are OK, use -file X for each input filename (prefixing input filenames with -file is optional)\n"
-		" -alpha_file filename.png/bmp: Input alpha image filename, multiple images are OK, use -file X for each input filename (must be paired with -file), images converted to REC709 grayscale and used as input alpha\n"
+		" -file filename.png/bmp/tga: Input image filename, multiple images are OK, use -file X for each input filename (prefixing input filenames with -file is optional)\n"
+		" -alpha_file filename.png/bmp/tga: Input alpha image filename, multiple images are OK, use -file X for each input filename (must be paired with -file), images converted to REC709 grayscale and used as input alpha\n"
 		" -multifile_printf: printf() format strint to use to compose multiple filenames\n"
 		" -multifile_first: The index of the first file to process, default is 0 (must specify -multifile_printf and -multifile_num)\n"
 		" -multifile_num: The total number of files to process.\n"