- Adding support for skip blocks, i-frames and p-frames to the backend and transcoder, for texture video use.
- For PVRTC1, we now memset() any extra blocks after the blocks we know will be transcoded into. This is done because in GL there is padding that occurs on textures smaller than 8x8.
- Removing all OpenMP usage and replacing it with a basic C++11 job system.
- Fixing the TSVQ class so it can split very long "thin" clusters that previously weren't being split. This improves endpoint quantization on videos and complex 2D textures.
- More frontend performance optimizations.

Tested on OSX, Linux with gcc/clang, and Win using MSVC 2019.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62d4a41..7302fe0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
 
 set(CMAKE_CXX_FLAGS -std=c++11)
-set(GCC_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fPIC -fopenmp -fno-strict-aliasing -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64 -Wall -Wextra -Wno-unused-local-typedefs -Wno-unused-value -Wno-unused-parameter -Wno-unused-but-set-variable -Wno-unused-variable -Wno-reorder")
+set(GCC_COMPILE_FLAGS "-fvisibility=hidden -fvisibility-inlines-hidden -fPIC -fno-strict-aliasing -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64 -Wall -Wextra -Wno-unused-local-typedefs -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-reorder")
 
 if (NOT BUILD_X64)
 	set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -m32")
diff --git a/basisu.vcxproj b/basisu.vcxproj
index 332ae13..88de436 100644
--- a/basisu.vcxproj
+++ b/basisu.vcxproj
@@ -98,7 +98,6 @@
       <WarningLevel>Level4</WarningLevel>
       <Optimization>Disabled</Optimization>
       <SDLCheck>true</SDLCheck>
-      <OpenMPSupport>true</OpenMPSupport>
       <AdditionalIncludeDirectories>
       </AdditionalIncludeDirectories>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -134,7 +133,6 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <SDLCheck>true</SDLCheck>
-      <OpenMPSupport>true</OpenMPSupport>
       <AdditionalIncludeDirectories>
       </AdditionalIncludeDirectories>
       <PreprocessorDefinitions>NDEBUG;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
diff --git a/basisu_backend.cpp b/basisu_backend.cpp
index 665320c..0ec918a 100644
--- a/basisu_backend.cpp
+++ b/basisu_backend.cpp
@@ -44,7 +44,7 @@
 		m_output.clear();
 	}
 
-	void basisu_backend::init(basisu_frontend *pFront_end, basisu_backend_params &params, const basisu_backend_slice_desc_vec &slice_descs, const basist::etc1_global_selector_codebook *pGlobal_sel_codebook)
+	void basisu_backend::init(basisu_frontend* pFront_end, basisu_backend_params& params, const basisu_backend_slice_desc_vec& slice_descs, const basist::etc1_global_selector_codebook* pGlobal_sel_codebook)
 	{
 		m_pFront_end = pFront_end;
 		m_params = params;
@@ -62,7 +62,7 @@
 			params.m_use_hybrid_sel_codebooks);
 
 		debug_printf("Frontend endpoints: %u selectors: %u\n", m_pFront_end->get_total_endpoint_clusters(), m_pFront_end->get_total_selector_clusters());
-		
+
 		for (uint32_t i = 0; i < m_slices.size(); i++)
 		{
 			debug_printf("Slice: %u, OrigWidth: %u, OrigHeight: %u, Width: %u, Height: %u, NumBlocksX: %u, NumBlocksY: %u, FirstBlockIndex: %u\n",
@@ -76,15 +76,15 @@
 
 	void basisu_backend::create_endpoint_palette()
 	{
-		const basisu_frontend &r = *m_pFront_end;
+		const basisu_frontend& r = *m_pFront_end;
 
 		m_output.m_num_endpoints = r.get_total_endpoint_clusters();
 
 		m_endpoint_palette.resize(r.get_total_endpoint_clusters());
 		for (uint32_t i = 0; i < r.get_total_endpoint_clusters(); i++)
 		{
-			etc1_endpoint_palette_entry &e = m_endpoint_palette[i];
-			
+			etc1_endpoint_palette_entry& e = m_endpoint_palette[i];
+
 			e.m_color5_valid = r.get_endpoint_cluster_color_is_used(i, false);
 			e.m_color5 = r.get_endpoint_cluster_unscaled_color(i, false);
 			e.m_inten5 = r.get_endpoint_cluster_inten_table(i, false);
@@ -95,7 +95,7 @@
 
 	void basisu_backend::create_selector_palette()
 	{
-		const basisu_frontend &r = *m_pFront_end;
+		const basisu_frontend& r = *m_pFront_end;
 
 		m_output.m_num_selectors = r.get_total_selector_clusters();
 
@@ -107,9 +107,9 @@
 
 			for (int i = 0; i < static_cast<int>(r.get_total_selector_clusters()); i++)
 			{
-				basist::etc1_selector_palette_entry &selector_pal_entry = m_selector_palette[i];
+				basist::etc1_selector_palette_entry& selector_pal_entry = m_selector_palette[i];
 
-				etc1_global_selector_cb_entry_desc &pal_entry_desc = m_global_selector_palette_desc[i];
+				etc1_global_selector_cb_entry_desc& pal_entry_desc = m_global_selector_palette_desc[i];
 				pal_entry_desc.m_pal_index = r.get_selector_cluster_global_selector_entry_ids()[i].m_palette_index;
 				pal_entry_desc.m_mod_index = r.get_selector_cluster_global_selector_entry_ids()[i].m_modifier.get_index();
 
@@ -119,7 +119,7 @@
 
 				if (pal_entry_desc.m_was_used)
 				{
-					const etc_block &selector_bits = r.get_selector_cluster_selector_bits(i);
+					const etc_block& selector_bits = r.get_selector_cluster_selector_bits(i);
 					(void)selector_bits;
 
 					basist::etc1_selector_palette_entry global_pal_entry(m_pGlobal_sel_codebook->get_entry(r.get_selector_cluster_global_selector_entry_ids()[i]));
@@ -136,7 +136,7 @@
 				}
 				else
 				{
-					const etc_block &selector_bits = r.get_selector_cluster_selector_bits(i);
+					const etc_block& selector_bits = r.get_selector_cluster_selector_bits(i);
 
 					for (uint32_t y = 0; y < 4; y++)
 						for (uint32_t x = 0; x < 4; x++)
@@ -148,9 +148,9 @@
 		{
 			for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++)
 			{
-				basist::etc1_selector_palette_entry &s = m_selector_palette[i];
+				basist::etc1_selector_palette_entry& s = m_selector_palette[i];
 
-				const etc_block &selector_bits = r.get_selector_cluster_selector_bits(i);
+				const etc_block& selector_bits = r.get_selector_cluster_selector_bits(i);
 
 				for (uint32_t y = 0; y < 4; y++)
 				{
@@ -162,28 +162,27 @@
 			}
 		}
 	}
-	
-	static const struct 
+
+	static const struct
 	{
 		int8_t m_dx, m_dy;
-	} g_endpoint_preds[] = 
+	} g_endpoint_preds[] =
 	{
 		{ -1, 0 },
 		{ 0, -1 },
 		{ -1, -1 }
 	};
-	const uint32_t NUM_ENDPOINT_PREDS = BASISU_ARRAY_SIZE(g_endpoint_preds);
-	const uint32_t NO_ENDPOINT_PRED_INDEX = 3;//NUM_ENDPOINT_PREDS;
 
-	void basisu_backend::reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec &all_endpoint_indices)
+	void basisu_backend::reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec& all_endpoint_indices)
 	{
-		basisu_frontend &r = *m_pFront_end;
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
 
 		if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0))
 		{
 			// We're changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed)
 			uint_vec new_block_endpoints(get_total_blocks());
-		
+
 			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 			{
 				const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
@@ -197,7 +196,7 @@
 
 			int_vec old_to_new_endpoint_indices;
 			r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true);
-			
+
 			create_endpoint_palette();
 
 			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
@@ -215,8 +214,8 @@
 					{
 						const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
 
-						encoder_block &m = m_slice_encoder_blocks[slice_index](block_x, block_y);
-						
+						encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
 						m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index];
 					} // block_x
 				} // block_y
@@ -224,7 +223,7 @@
 
 			for (uint32_t i = 0; i < all_endpoint_indices.size(); i++)
 				all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]];
-		
+
 		} //if (total_block_endpoints_remapped)
 
 		// Sort endpoint codebook
@@ -239,64 +238,165 @@
 
 	void basisu_backend::sort_selector_codebook()
 	{
-		basisu_frontend &r = *m_pFront_end;
+		basisu_frontend& r = *m_pFront_end;
 
 		m_selector_remap_table_new_to_old.resize(r.get_total_selector_clusters());
-		
-		m_selector_remap_table_new_to_old[0] = 0;
-		uint32_t prev_selector_index = 0;
 
-		int_vec remaining_selectors;
-		remaining_selectors.reserve(r.get_total_selector_clusters() - 1);
-		for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++)
-			remaining_selectors.push_back(i);
-
-		uint_vec selector_palette_bytes(m_selector_palette.size());
-		for (uint32_t i = 0; i < m_selector_palette.size(); i++)
-			selector_palette_bytes[i] = m_selector_palette[i].get_byte(0) | (m_selector_palette[i].get_byte(1) << 8) |(m_selector_palette[i].get_byte(2) << 16) |(m_selector_palette[i].get_byte(3) << 24);
-
-		// This is the traveling salesman problem.
-		for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++)
+		if (m_params.m_compression_level == 0)
 		{
-			uint32_t best_hamming_dist = 100;
-			uint32_t best_index = 0;
+			for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++)
+				m_selector_remap_table_new_to_old[i] = i;
+		}
+		else
+		{
+			m_selector_remap_table_new_to_old[0] = 0;
+			uint32_t prev_selector_index = 0;
+
+			int_vec remaining_selectors;
+			remaining_selectors.reserve(r.get_total_selector_clusters() - 1);
+			for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++)
+				remaining_selectors.push_back(i);
+
+			uint_vec selector_palette_bytes(m_selector_palette.size());
+			for (uint32_t i = 0; i < m_selector_palette.size(); i++)
+				selector_palette_bytes[i] = m_selector_palette[i].get_byte(0) | (m_selector_palette[i].get_byte(1) << 8) | (m_selector_palette[i].get_byte(2) << 16) | (m_selector_palette[i].get_byte(3) << 24);
+
+			// This is the traveling salesman problem.
+			for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++)
+			{
+				uint32_t best_hamming_dist = 100;
+				uint32_t best_index = 0;
 
 #if BASISU_FASTER_SELECTOR_REORDERING
-			const uint32_t step = (remaining_selectors.size() > 16) ? 16 : 1;
-			for (uint32_t j = 0; j < remaining_selectors.size(); j += step)
+				const uint32_t step = (remaining_selectors.size() > 16) ? 16 : 1;
+				for (uint32_t j = 0; j < remaining_selectors.size(); j += step)
 #else
-			for (uint32_t j = 0; j < remaining_selectors.size(); j++)
+				for (uint32_t j = 0; j < remaining_selectors.size(); j++)
 #endif
-			{
-				int selector_index = remaining_selectors[j];
-				
-				uint32_t k = selector_palette_bytes[prev_selector_index] ^ selector_palette_bytes[selector_index];
-				uint32_t hamming_dist = g_hamming_dist[k & 0xFF] + g_hamming_dist[(k >> 8) & 0xFF] + g_hamming_dist[(k >> 16) & 0xFF] + g_hamming_dist[k >> 24];
-				
-				if (hamming_dist < best_hamming_dist)
 				{
-					best_hamming_dist = hamming_dist;
-					best_index = j;
-					if (best_hamming_dist <= 1)
-						break;
+					int selector_index = remaining_selectors[j];
+
+					uint32_t k = selector_palette_bytes[prev_selector_index] ^ selector_palette_bytes[selector_index];
+					uint32_t hamming_dist = g_hamming_dist[k & 0xFF] + g_hamming_dist[(k >> 8) & 0xFF] + g_hamming_dist[(k >> 16) & 0xFF] + g_hamming_dist[k >> 24];
+
+					if (hamming_dist < best_hamming_dist)
+					{
+						best_hamming_dist = hamming_dist;
+						best_index = j;
+						if (best_hamming_dist <= 1)
+							break;
+					}
 				}
+
+				prev_selector_index = remaining_selectors[best_index];
+				m_selector_remap_table_new_to_old[i] = prev_selector_index;
+
+				remaining_selectors[best_index] = remaining_selectors.back();
+				remaining_selectors.resize(remaining_selectors.size() - 1);
 			}
-			
-			prev_selector_index = remaining_selectors[best_index];
-			m_selector_remap_table_new_to_old[i] = prev_selector_index;
-			
-			remaining_selectors[best_index] = remaining_selectors.back();
-			remaining_selectors.resize(remaining_selectors.size() - 1);
 		}
-		
+
 		m_selector_remap_table_old_to_new.resize(r.get_total_selector_clusters());
 		for (uint32_t i = 0; i < m_selector_remap_table_new_to_old.size(); i++)
 			m_selector_remap_table_old_to_new[m_selector_remap_table_new_to_old[i]] = i;
 	}
-				
+	int basisu_backend::find_video_frame(int slice_index, int delta)
+	{
+		for (uint32_t s = 0; s < m_slices.size(); s++)
+		{
+			if ((int)m_slices[s].m_source_file_index != ((int)m_slices[slice_index].m_source_file_index + delta))
+				continue;
+			if (m_slices[s].m_mip_index != m_slices[slice_index].m_mip_index)
+				continue;
+
+			// Being super paranoid here.
+			if (m_slices[s].m_num_blocks_x != (m_slices[slice_index].m_num_blocks_x))
+				continue;
+			if (m_slices[s].m_num_blocks_y != (m_slices[slice_index].m_num_blocks_y))
+				continue;
+			if (m_slices[s].m_alpha != (m_slices[slice_index].m_alpha))
+				continue;
+			return s;
+		}
+
+		return -1;
+	}
+
+	void basisu_backend::check_for_valid_cr_blocks()
+	{
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+
+		if (!is_video)
+			return;
+
+		uint32_t total_crs = 0;
+		uint32_t total_invalid_crs = 0;
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const bool is_iframe = m_slices[slice_index].m_iframe;
+			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+
+			const uint32_t width = m_slices[slice_index].m_width;
+			const uint32_t height = m_slices[slice_index].m_height;
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+			const int prev_frame_slice_index = find_video_frame(slice_index, -1);
+
+			// If we don't have a previous frame, and we're not an i-frame, something is wrong.
+			if ((prev_frame_slice_index < 0) && (!is_iframe))
+			{
+				BASISU_BACKEND_VERIFY(0);
+			}
+
+			if ((is_iframe) || (prev_frame_slice_index < 0))
+			{
+				// Ensure no blocks use CR's
+				for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+				{
+					for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+					{
+						encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+						BASISU_BACKEND_VERIFY(m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX);
+					}
+				}
+			}
+			else
+			{
+				// For blocks that use CR's, make sure the endpoints/selectors haven't really changed.
+				for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+				{
+					for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+					{
+						encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
+						if (m.m_endpoint_predictor == basist::CR_ENDPOINT_PRED_INDEX)
+						{
+							total_crs++;
+
+							encoder_block& prev_m = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y);
+
+							if ((m.m_endpoint_index != prev_m.m_endpoint_index) || (m.m_selector_index != prev_m.m_selector_index))
+							{
+								total_invalid_crs++;
+							}
+						}
+					} // block_x
+				} // block_y
+
+			} // !slice_index
+
+		} // slice_index
+
+		debug_printf("Total CR's: %u, Total invalid CR's: %u\n", total_crs, total_invalid_crs);
+
+		BASISU_BACKEND_VERIFY(total_invalid_crs == 0);
+	}
 	void basisu_backend::create_encoder_blocks()
 	{
-		basisu_frontend &r = *m_pFront_end;
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
 
 		m_slice_encoder_blocks.resize(m_slices.size());
 
@@ -304,9 +404,11 @@
 
 		uint_vec all_endpoint_indices;
 		all_endpoint_indices.reserve(get_total_blocks());
-										
+
 		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 		{
+			const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1;
+			const bool is_iframe = m_slices[slice_index].m_iframe;
 			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
 
 			const uint32_t width = m_slices[slice_index].m_width;
@@ -322,50 +424,68 @@
 				{
 					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
 
-					encoder_block &m = m_slice_encoder_blocks[slice_index](block_x, block_y);
-																				
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
 					m.m_endpoint_index = r.get_subblock_endpoint_cluster_index(block_index, 0);
 					BASISU_BACKEND_VERIFY(r.get_subblock_endpoint_cluster_index(block_index, 0) == r.get_subblock_endpoint_cluster_index(block_index, 1));
 
 					m.m_selector_index = r.get_block_selector_cluster_index(block_index);
 
-					m.m_endpoint_predictor = NO_ENDPOINT_PRED_INDEX;
+					m.m_endpoint_predictor = basist::NO_ENDPOINT_PRED_INDEX;
 
 					const uint32_t block_endpoint = m.m_endpoint_index;
 
 					uint32_t best_endpoint_pred = UINT32_MAX;
 
-					for (uint32_t endpoint_pred = 0; endpoint_pred < NUM_ENDPOINT_PREDS; endpoint_pred++)
+					for (uint32_t endpoint_pred = 0; endpoint_pred < basist::NUM_ENDPOINT_PREDS; endpoint_pred++)
 					{
-						int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx;
-						if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x))
-							continue;
-
-						int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy;
-						if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y))
-							continue;
-						
-						uint32_t pred_endpoint = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index;
-									
-						if (pred_endpoint == block_endpoint)
+						if ((is_video) && (endpoint_pred == basist::CR_ENDPOINT_PRED_INDEX))
 						{
-							if (endpoint_pred < best_endpoint_pred)
+							if ((prev_frame_slice_index != -1) && (!is_iframe))
 							{
-								best_endpoint_pred = endpoint_pred;
+								const uint32_t cur_endpoint = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index;
+								const uint32_t cur_selector = m_slice_encoder_blocks[slice_index](block_x, block_y).m_selector_index;
+								const uint32_t prev_endpoint = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_endpoint_index;
+								const uint32_t prev_selector = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_selector_index;
+								if ((cur_endpoint == prev_endpoint) && (cur_selector == prev_selector))
+								{
+									best_endpoint_pred = basist::CR_ENDPOINT_PRED_INDEX;
+									m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_is_cr_target = true;
+								}
 							}
 						}
-					
+						else
+						{
+							int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx;
+							if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x))
+								continue;
+
+							int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy;
+							if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y))
+								continue;
+
+							uint32_t pred_endpoint = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index;
+
+							if (pred_endpoint == block_endpoint)
+							{
+								if (endpoint_pred < best_endpoint_pred)
+								{
+									best_endpoint_pred = endpoint_pred;
+								}
+							}
+						}
+
 					} // endpoint_pred
 
 					if (best_endpoint_pred != UINT32_MAX)
 					{
 						m.m_endpoint_predictor = best_endpoint_pred;
-						
+
 						total_endpoint_pred_hits++;
 					}
 					else if (m_params.m_endpoint_rdo_quality_thresh > 0.0f)
 					{
-						const pixel_block &src_pixels = r.get_source_pixel_block(block_index);
+						const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
 
 						etc_block etc_blk(r.get_output_block(block_index));
 
@@ -379,11 +499,13 @@
 
 							uint64_t best_err = UINT64_MAX;
 							uint32_t best_endpoint_index = 0;
-							
+
 							best_endpoint_pred = UINT32_MAX;
-												
-							for (uint32_t endpoint_pred = 0; endpoint_pred < NUM_ENDPOINT_PREDS; endpoint_pred++)
+
+							for (uint32_t endpoint_pred = 0; endpoint_pred < basist::NUM_ENDPOINT_PREDS; endpoint_pred++)
 							{
+								if ((is_video) && (endpoint_pred == basist::CR_ENDPOINT_PRED_INDEX))
+									continue;
 								int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx;
 								if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x))
 									continue;
@@ -391,12 +513,12 @@
 								int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy;
 								if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y))
 									continue;
-						
+
 								uint32_t pred_endpoint_index = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index;
 
 								uint32_t pred_inten = r.get_endpoint_cluster_inten_table(pred_endpoint_index, false);
 								color_rgba pred_color = r.get_endpoint_cluster_unscaled_color(pred_endpoint_index, false);
-								
+
 								trial_etc_block.set_block_color5(pred_color, pred_color);
 								trial_etc_block.set_inten_table(0, pred_inten);
 								trial_etc_block.set_inten_table(1, pred_inten);
@@ -411,7 +533,7 @@
 									if (trial_err > thresh_err)
 										break;
 								}
-								
+
 								if (trial_err <= thresh_err)
 								{
 									if ((trial_err < best_err) || ((trial_err == best_err) && (endpoint_pred < best_endpoint_pred)))
@@ -442,25 +564,26 @@
 						total_endpoint_pred_missed++;
 					}
 
-					if (m.m_endpoint_predictor == NO_ENDPOINT_PRED_INDEX)
+					if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX)
 					{
 						all_endpoint_indices.push_back(m.m_endpoint_index);
 					}
-					
+
 				} // block_x
 
 			} // block_y
 
 		} // slice
 
-		debug_printf("total_endpoint_pred_missed: %u (%3.2f%%) total_endpoint_pred_hit: %u (%3.2f%%), total_block_endpoints_remapped: %u (%3.2f%%)\n", 
-			total_endpoint_pred_missed, total_endpoint_pred_missed * 100.0f / get_total_blocks(), 
+		debug_printf("total_endpoint_pred_missed: %u (%3.2f%%) total_endpoint_pred_hit: %u (%3.2f%%), total_block_endpoints_remapped: %u (%3.2f%%)\n",
+			total_endpoint_pred_missed, total_endpoint_pred_missed * 100.0f / get_total_blocks(),
 			total_endpoint_pred_hits, total_endpoint_pred_hits * 100.0f / get_total_blocks(),
 			total_block_endpoints_remapped, total_block_endpoints_remapped * 100.0f / get_total_blocks());
 
 		reoptimize_and_sort_endpoints_codebook(total_block_endpoints_remapped, all_endpoint_indices);
-				
+
 		sort_selector_codebook();
+		check_for_valid_cr_blocks();
 	}
 
 	void basisu_backend::compute_slice_crcs()
@@ -482,22 +605,22 @@
 				{
 					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
 
-					encoder_block &m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
 
 					{
-						etc_block &output_block = *(etc_block *)gi.get_block_ptr(block_x, block_y);
+						etc_block& output_block = *(etc_block*)gi.get_block_ptr(block_x, block_y);
 
 						output_block.set_diff_bit(true);
 						output_block.set_flip_bit(true);
-												
+
 						const uint32_t endpoint_index = m.m_endpoint_index;
-						
+
 						output_block.set_block_color5_etc1s(m_endpoint_palette[endpoint_index].m_color5);
 						output_block.set_inten_tables_etc1s(m_endpoint_palette[endpoint_index].m_inten5);
 
 						const uint32_t selector_idx = m.m_selector_index;
 
-						const basist::etc1_selector_palette_entry &selectors = m_selector_palette[selector_idx];
+						const basist::etc1_selector_palette_entry& selectors = m_selector_palette[selector_idx];
 						for (uint32_t sy = 0; sy < 4; sy++)
 							for (uint32_t sx = 0; sx < 4; sx++)
 								output_block.set_selector(sx, sy, selectors(sx, sy));
@@ -521,23 +644,24 @@
 #endif				
 				save_png(buf, gi_unpacked);
 			}
-			 
+
 		} // slice_index
 	}
 
 	// TODO: Split this into multiple methods.
 	bool basisu_backend::encode_image()
 	{
-		basisu_frontend &r = *m_pFront_end;
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
 
 		uint32_t total_used_selector_history_buf = 0;
 		uint32_t total_selector_indices_remapped = 0;
 
 		basist::approx_move_to_front selector_history_buf(basist::MAX_SELECTOR_HISTORY_BUF_SIZE);
 		histogram selector_history_buf_histogram(basist::MAX_SELECTOR_HISTORY_BUF_SIZE);
-		histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE +  1);
+		histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE + 1);
 		histogram selector_history_buf_rle_histogram(1 << basist::SELECTOR_HISTORY_BUF_RLE_COUNT_BITS);
-				
+
 		std::vector<uint_vec> selector_syms(m_slices.size());
 
 		const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = r.get_total_selector_clusters();
@@ -548,14 +672,16 @@
 		histogram delta_endpoint_histogram(r.get_total_endpoint_clusters());
 
 		histogram endpoint_pred_histogram(basist::ENDPOINT_PRED_TOTAL_SYMBOLS);
-		std::vector<uint_vec> endpoint_pred_syms(m_slices.size());		
-						
+		std::vector<uint_vec> endpoint_pred_syms(m_slices.size());
+
 		uint32_t total_endpoint_indices_remapped = 0;
 
 		uint_vec block_endpoint_indices, block_selector_indices;
 
 		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 		{
+			const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1;
+			const int next_frame_slice_index = is_video ? find_video_frame(slice_index, 1) : -1;
 			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
 			const uint32_t width = m_slices[slice_index].m_width;
 			const uint32_t height = m_slices[slice_index].m_height;
@@ -565,9 +691,9 @@
 			selector_history_buf.reset();
 
 			int selector_history_buf_rle_count = 0;
-						
+
 			int prev_endpoint_pred_sym_bits = -1, endpoint_pred_repeat_count = 0;
-						
+
 			uint32_t prev_endpoint_index = 0;
 
 			vector2D<uint8_t> block_endpoints_are_referenced(num_blocks_x, num_blocks_y);
@@ -578,24 +704,32 @@
 				{
 					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
 
-					encoder_block &m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
 
 					if (m.m_endpoint_predictor == 0)
 						block_endpoints_are_referenced(block_x - 1, block_y) = true;
 					else if (m.m_endpoint_predictor == 1)
 						block_endpoints_are_referenced(block_x, block_y - 1) = true;
 					else if (m.m_endpoint_predictor == 2)
-						block_endpoints_are_referenced(block_x - 1, block_y - 1) = true;
-				}
-			}
-						
+					{
+						if (!is_video)
+							block_endpoints_are_referenced(block_x - 1, block_y - 1) = true;
+					}
+					if (is_video)
+					{
+						if (m.m_is_cr_target)
+							block_endpoints_are_referenced(block_x, block_y) = true;
+					}
+
+				}  // block_x
+			} // block_y
 			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
 			{
 				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
 				{
 					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
 
-					encoder_block &m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
 
 					if (((block_x & 1) == 0) && ((block_y & 1) == 0))
 					{
@@ -607,15 +741,15 @@
 							{
 								const uint32_t bx = block_x + x;
 								const uint32_t by = block_y + y;
-								
-								uint32_t pred = NO_ENDPOINT_PRED_INDEX;
+
+								uint32_t pred = basist::NO_ENDPOINT_PRED_INDEX;
 								if ((bx < num_blocks_x) && (by < num_blocks_y))
 									pred = m_slice_encoder_blocks[slice_index](bx, by).m_endpoint_predictor;
-									
+
 								endpoint_pred_cur_sym_bits |= (pred << (x * 2 + y * 4));
 							}
 						}
-					
+
 						if ((int)endpoint_pred_cur_sym_bits == prev_endpoint_pred_sym_bits)
 						{
 							endpoint_pred_repeat_count++;
@@ -628,7 +762,7 @@
 								{
 									endpoint_pred_histogram.inc(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
 									endpoint_pred_syms[slice_index].push_back(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
-									
+
 									endpoint_pred_syms[slice_index].push_back(endpoint_pred_repeat_count);
 								}
 								else
@@ -652,26 +786,26 @@
 
 					int new_endpoint_index = m_endpoint_remap_table_old_to_new[m.m_endpoint_index];
 
-					if (m.m_endpoint_predictor == NO_ENDPOINT_PRED_INDEX) 
+					if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX)
 					{
 						int endpoint_delta = new_endpoint_index - prev_endpoint_index;
-						
+
 						if ((m_params.m_endpoint_rdo_quality_thresh > 1.0f) && (iabs(endpoint_delta) > 1) && (!block_endpoints_are_referenced(block_x, block_y)))
 						{
-							const pixel_block &src_pixels = r.get_source_pixel_block(block_index);
+							const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
 
 							etc_block etc_blk(r.get_output_block(block_index));
 
 							const uint64_t cur_err = etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual);
 
-							if (cur_err) 
+							if (cur_err)
 							{
 								const float endpoint_remap_thresh = maximum(1.0f, m_params.m_endpoint_rdo_quality_thresh);
 								const uint64_t thresh_err = (uint64_t)(cur_err * endpoint_remap_thresh);
 
 								uint64_t best_trial_err = UINT64_MAX;
 								int best_trial_idx = 0;
-														
+
 								etc_block trial_etc_blk(etc_blk);
 
 								const int MAX_ENDPOINT_SEARCH_DIST = 32;
@@ -687,10 +821,10 @@
 									if (trial_idx == new_endpoint_index)
 										continue;
 
-									const etc1_endpoint_palette_entry &p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]];
+									const etc1_endpoint_palette_entry& p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]];
 									trial_etc_blk.set_block_color5_etc1s(p.m_color5);
 									trial_etc_blk.set_inten_tables_etc1s(p.m_inten5);
-								
+
 									uint64_t trial_err = trial_etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual);
 
 									if (trial_err <= thresh_err)
@@ -708,14 +842,14 @@
 									m.m_endpoint_index = m_endpoint_remap_table_new_to_old[best_trial_idx];
 
 									new_endpoint_index = best_trial_idx;
-									
+
 									endpoint_delta = new_endpoint_index - prev_endpoint_index;
 
 									total_endpoint_indices_remapped++;
 								}
 							}
 						}
-						
+
 						if (endpoint_delta < 0)
 							endpoint_delta += (int)r.get_total_endpoint_clusters();
 
@@ -723,16 +857,32 @@
 					}
 
 					block_endpoint_indices.push_back(m_endpoint_remap_table_new_to_old[new_endpoint_index]);
-					
+
 					prev_endpoint_index = new_endpoint_index;
-																			
+
+					if ((!is_video) || (m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX))
 					{
 						int new_selector_index = m_selector_remap_table_old_to_new[m.m_selector_index];
 
 						int selector_history_buf_index = -1;
 
+						if (m.m_is_cr_target)
 						{
-							const pixel_block &src_pixels = r.get_source_pixel_block(block_index);
+							for (uint32_t j = 0; j < selector_history_buf.size(); j++)
+							{
+								const int trial_idx = selector_history_buf[j];
+								if (trial_idx == new_selector_index)
+								{
+									total_used_selector_history_buf++;
+									selector_history_buf_index = j;
+									selector_history_buf_histogram.inc(j);
+									break;
+								}
+							}
+						}
+						else
+						{
+							const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
 
 							etc_block etc_blk(r.get_output_block(block_index));
 
@@ -742,43 +892,55 @@
 							uint64_t cur_err = 0;
 							for (uint32_t p = 0; p < 16; p++)
 								cur_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
-														
+
 							uint64_t best_trial_err = UINT64_MAX;
 							int best_trial_idx = 0;
 							uint32_t best_trial_history_buf_idx = 0;
 
-							etc_block best_trial_etc_block;
 
 							const float selector_remap_thresh = maximum(1.0f, m_params.m_selector_rdo_quality_thresh); //2.5f;
+							const bool use_strict_search = (m_params.m_compression_level == 0) && (selector_remap_thresh == 1.0f);
 
 							for (uint32_t j = 0; j < selector_history_buf.size(); j++)
 							{
-								int trial_idx = selector_history_buf[j];
+								const int trial_idx = selector_history_buf[j];
 
-								for (uint32_t sy = 0; sy < 4; sy++)
-									for (uint32_t sx = 0; sx < 4; sx++)
-										etc_blk.set_selector(sx, sy, m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](sx, sy));
-
-								// TODO: Optimize this
-								unpack_etc1(etc_blk, etc_blk_unpacked);
-
-								uint64_t trial_err = 0;
-								const uint64_t thresh_err = minimum((uint64_t)ceilf(cur_err * selector_remap_thresh), best_trial_err);																
-								for (uint32_t p = 0; p < 16; p++)
+								if (use_strict_search)
 								{
-									trial_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
-									if (trial_err > thresh_err)
-										break;
-								}
-
-								if (trial_err <= cur_err * selector_remap_thresh)
-								{
-									if (trial_err < best_trial_err)
+									if (trial_idx == new_selector_index)
 									{
-										best_trial_err = trial_err;
+										best_trial_err = 0;
 										best_trial_idx = trial_idx;
-										best_trial_etc_block = etc_blk;
 										best_trial_history_buf_idx = j;
+										break;
+									}
+								}
+								else
+								{
+									for (uint32_t sy = 0; sy < 4; sy++)
+										for (uint32_t sx = 0; sx < 4; sx++)
+											etc_blk.set_selector(sx, sy, m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](sx, sy));
+
+									// TODO: Optimize this
+									unpack_etc1(etc_blk, etc_blk_unpacked);
+
+									uint64_t trial_err = 0;
+									const uint64_t thresh_err = minimum((uint64_t)ceilf(cur_err * selector_remap_thresh), best_trial_err);
+									for (uint32_t p = 0; p < 16; p++)
+									{
+										trial_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+										if (trial_err > thresh_err)
+											break;
+									}
+
+									if (trial_err <= cur_err * selector_remap_thresh)
+									{
+										if (trial_err < best_trial_err)
+										{
+											best_trial_err = trial_err;
+											best_trial_idx = trial_idx;
+											best_trial_history_buf_idx = j;
+										}
 									}
 								}
 							}
@@ -789,7 +951,7 @@
 									total_selector_indices_remapped++;
 
 								new_selector_index = best_trial_idx;
-																
+
 								total_used_selector_history_buf++;
 
 								selector_history_buf_index = best_trial_history_buf_idx;
@@ -799,8 +961,7 @@
 						} // if (m_params.m_selector_rdo_quality_thresh > 0.0f)
 
 						m.m_selector_index = m_selector_remap_table_new_to_old[new_selector_index];
-						
-						block_selector_indices.push_back(m.m_selector_index);
+
 
 						if ((selector_history_buf_rle_count) && (selector_history_buf_index != 0))
 						{
@@ -853,24 +1014,25 @@
 						}
 
 						m.m_selector_history_buf_index = selector_history_buf_index;
-												
+
 						if (selector_history_buf_index < 0)
 							selector_history_buf.add(new_selector_index);
 						else if (selector_history_buf.size())
 							selector_history_buf.use(selector_history_buf_index);
 					}
+					block_selector_indices.push_back(m.m_selector_index);
 
 				} // block_x
 
 			} // block_y
-						
+
 			if (endpoint_pred_repeat_count > 0)
 			{
 				if (endpoint_pred_repeat_count > (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT)
 				{
 					endpoint_pred_histogram.inc(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
 					endpoint_pred_syms[slice_index].push_back(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
-									
+
 					endpoint_pred_syms[slice_index].push_back(endpoint_pred_repeat_count);
 				}
 				else
@@ -914,12 +1076,12 @@
 
 				selector_history_buf_rle_count = 0;
 			}
-			
+
 		} // slice_index
 
 		debug_printf("Endpoint pred RDO total endpoint indices remapped: %u %3.2f%%\n",
 			total_endpoint_indices_remapped, total_endpoint_indices_remapped * 100.0f / get_total_blocks());
-		
+
 		debug_printf("Selector history RDO total selector indices remapped: %u %3.2f%%, Used history buf: %u %3.2f%%\n",
 			total_selector_indices_remapped, total_selector_indices_remapped * 100.0f / get_total_blocks(),
 			total_used_selector_history_buf, total_used_selector_history_buf * 100.0f / get_total_blocks());
@@ -928,18 +1090,21 @@
 		{
 			int_vec unused;
 			r.reoptimize_remapped_endpoints(block_endpoint_indices, unused, false, &block_selector_indices);
-		
+
 			create_endpoint_palette();
 		}
 
+		check_for_valid_cr_blocks();
 		compute_slice_crcs();
-				
+
 		double endpoint_pred_entropy = endpoint_pred_histogram.get_entropy() / endpoint_pred_histogram.get_total();
 		double delta_endpoint_entropy = delta_endpoint_histogram.get_entropy() / delta_endpoint_histogram.get_total();
 		double selector_entropy = selector_histogram.get_entropy() / selector_histogram.get_total();
-		
+
 		debug_printf("Histogram entropy: EndpointPred: %3.3f DeltaEndpoint: %3.3f DeltaSelector: %3.3f\n", endpoint_pred_entropy, delta_endpoint_entropy, selector_entropy);
 
+		if (!endpoint_pred_histogram.get_total())
+			endpoint_pred_histogram.inc(0);
 		huffman_encoding_table endpoint_pred_model;
 		if (!endpoint_pred_model.init(endpoint_pred_histogram, 16))
 		{
@@ -947,12 +1112,16 @@
 			return false;
 		}
 
+		if (!delta_endpoint_histogram.get_total())
+			delta_endpoint_histogram.inc(0);
 		huffman_encoding_table delta_endpoint_model;
 		if (!delta_endpoint_model.init(delta_endpoint_histogram, 16))
 		{
 			error_printf("delta_endpoint_model.init() failed!");
 			return false;
 		}
+		if (!selector_histogram.get_total())
+			selector_histogram.inc(0);
 
 		huffman_encoding_table selector_model;
 		if (!selector_model.init(selector_histogram, 16))
@@ -980,7 +1149,7 @@
 		uint32_t selector_history_buf_run_sym_bits = coder.emit_huffman_table(selector_history_buf_rle_model);
 
 		coder.put_bits(basist::MAX_SELECTOR_HISTORY_BUF_SIZE, 13);
-		
+
 		debug_printf("Model sizes: EndpointPred: %u bits %u bytes (%3.3f bpp) DeltaEndpoint: %u bits %u bytes (%3.3f bpp) Selector: %u bits %u bytes (%3.3f bpp) SelectorHistBufRLE: %u bits %u bytes (%3.3f bpp)\n",
 			endpoint_pred_model_bits, (endpoint_pred_model_bits + 7) / 8, endpoint_pred_model_bits / float(get_total_input_texels()),
 			delta_endpoint_bits, (delta_endpoint_bits + 7) / 8, delta_endpoint_bits / float(get_total_input_texels()),
@@ -1011,14 +1180,14 @@
 
 			int endpoint_pred_repeat_count = 0;
 			uint32_t cur_endpoint_pred_sym_ofs = 0;
-			uint32_t prev_endpoint_pred_sym = 0;
+//			uint32_t prev_endpoint_pred_sym = 0;
 			uint32_t prev_endpoint_index = 0;
-						
+
 			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
 			{
 				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
 				{
-					const encoder_block &m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					const encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
 
 					if (((block_x & 1) == 0) && ((block_y & 1) == 0))
 					{
@@ -1036,23 +1205,23 @@
 
 								endpoint_pred_repeat_count = endpoint_pred_syms[slice_index][cur_endpoint_pred_sym_ofs++];
 								assert(endpoint_pred_repeat_count >= (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT);
-								
-								total_endpoint_pred_bits += coder.put_vlc(endpoint_pred_repeat_count - basist::ENDPOINT_PRED_MIN_REPEAT_COUNT, basist::ENDPOINT_PRED_COUNT_VLC_BITS);	
+
+								total_endpoint_pred_bits += coder.put_vlc(endpoint_pred_repeat_count - basist::ENDPOINT_PRED_MIN_REPEAT_COUNT, basist::ENDPOINT_PRED_COUNT_VLC_BITS);
 
 								endpoint_pred_repeat_count--;
 							}
 							else
 							{
 								total_endpoint_pred_bits += coder.put_code(sym, endpoint_pred_model);
-								
-								prev_endpoint_pred_sym = sym;
+
+								//prev_endpoint_pred_sym = sym;
 							}
 						}
 					}
 
 					const int new_endpoint_index = m_endpoint_remap_table_old_to_new[m.m_endpoint_index];
 
-					if (m.m_endpoint_predictor == NO_ENDPOINT_PRED_INDEX)
+					if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX)
 					{
 						int endpoint_delta = new_endpoint_index - prev_endpoint_index;
 						if (endpoint_delta < 0)
@@ -1062,33 +1231,36 @@
 					}
 
 					prev_endpoint_index = new_endpoint_index;
-					
-					if (!selector_rle_count)
+
+					if ((!is_video) || (m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX))
 					{
-						uint32_t selector_sym_index = selector_syms[slice_index][cur_selector_sym_ofs++];
-
-						if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX)
-							selector_rle_count = selector_syms[slice_index][cur_selector_sym_ofs++];
-
-						total_selector_bits += coder.put_code(selector_sym_index, selector_model);
-
-						if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX)
+						if (!selector_rle_count)
 						{
-							int run_sym = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
-							if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
-							{
-								total_selector_bits += coder.put_code(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1, selector_history_buf_rle_model);
-									
-								uint32_t n = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
-								total_selector_bits += coder.put_vlc(n, 7);
-							}
-							else
-								total_selector_bits += coder.put_code(run_sym, selector_history_buf_rle_model);
-						}
-					}
+							uint32_t selector_sym_index = selector_syms[slice_index][cur_selector_sym_ofs++];
 
-					if (selector_rle_count)
-						selector_rle_count--;
+							if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX)
+								selector_rle_count = selector_syms[slice_index][cur_selector_sym_ofs++];
+
+							total_selector_bits += coder.put_code(selector_sym_index, selector_model);
+
+							if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX)
+							{
+								int run_sym = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+								if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
+								{
+									total_selector_bits += coder.put_code(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1, selector_history_buf_rle_model);
+
+									uint32_t n = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+									total_selector_bits += coder.put_vlc(n, 7);
+								}
+								else
+									total_selector_bits += coder.put_code(run_sym, selector_history_buf_rle_model);
+							}
+						}
+
+						if (selector_rle_count)
+							selector_rle_count--;
+					}
 
 				} // block_x
 
@@ -1096,7 +1268,7 @@
 
 			BASISU_BACKEND_VERIFY(cur_endpoint_pred_sym_ofs == endpoint_pred_syms[slice_index].size());
 			BASISU_BACKEND_VERIFY(cur_selector_sym_ofs == selector_syms[slice_index].size());
-			
+
 			coder.flush();
 
 			m_output.m_slice_image_data[slice_index] = coder.get_bytes();
@@ -1122,8 +1294,8 @@
 
 	bool basisu_backend::encode_endpoint_palette()
 	{
-		const basisu_frontend &r = *m_pFront_end;
-		
+		const basisu_frontend& r = *m_pFront_end;
+
 		// Maps NEW to OLD endpoints
 		uint_vec endpoint_remap_table_inv(r.get_total_endpoint_clusters());
 		for (uint32_t old_endpoint_index = 0; old_endpoint_index < m_endpoint_remap_table_old_to_new.size(); old_endpoint_index++)
@@ -1141,23 +1313,23 @@
 				break;
 			}
 		}
-		
+
 		histogram color5_delta_hist0(32); // prev 0-9, delta is -9 to 31
 		histogram color5_delta_hist1(32); // prev 10-21, delta is -21 to 21
 		histogram color5_delta_hist2(32); // prev 22-31, delta is -31 to 9
 		histogram inten_delta_hist(8);
-										
+
 		color_rgba prev_color5(16, 16, 16, 0);
 		uint32_t prev_inten = 0;
-						
+
 		for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++)
 		{
 			const uint32_t old_endpoint_index = endpoint_remap_table_inv[new_endpoint_index];
-						
+
 			int delta_inten = m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten;
 			inten_delta_hist.inc(delta_inten & 7);
 			prev_inten = m_endpoint_palette[old_endpoint_index].m_inten5;
-			
+
 			for (uint32_t i = 0; i < (is_grayscale ? 1U : 3U); i++)
 			{
 				const int delta = (m_endpoint_palette[old_endpoint_index].m_color5[i] - prev_color5[i]) & 31;
@@ -1176,7 +1348,7 @@
 		if (!color5_delta_hist0.get_total()) color5_delta_hist0.inc(0);
 		if (!color5_delta_hist1.get_total()) color5_delta_hist1.inc(0);
 		if (!color5_delta_hist2.get_total()) color5_delta_hist2.inc(0);
-						
+
 		huffman_encoding_table color5_delta_model0, color5_delta_model1, color5_delta_model2, inten_delta_model;
 		if (!color5_delta_model0.init(color5_delta_hist0, 16))
 		{
@@ -1201,7 +1373,7 @@
 			error_printf("inten3_model.init() failed!");
 			return false;
 		}
-												
+
 		bitwise_coder coder;
 
 		coder.init(8192);
@@ -1210,7 +1382,7 @@
 		coder.emit_huffman_table(color5_delta_model1);
 		coder.emit_huffman_table(color5_delta_model2);
 		coder.emit_huffman_table(inten_delta_model);
-		
+
 		coder.put_bits(is_grayscale, 1);
 
 		prev_color5.set(16, 16, 16, 0);
@@ -1219,11 +1391,11 @@
 		for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++)
 		{
 			const uint32_t old_endpoint_index = endpoint_remap_table_inv[new_endpoint_index];
-						
+
 			int delta_inten = (m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten) & 7;
 			coder.put_code(delta_inten, inten_delta_model);
 			prev_inten = m_endpoint_palette[old_endpoint_index].m_inten5;
-						
+
 			for (uint32_t i = 0; i < (is_grayscale ? 1U : 3U); i++)
 			{
 				const int delta = (m_endpoint_palette[old_endpoint_index].m_color5[i] - prev_color5[i]) & 31;
@@ -1252,8 +1424,8 @@
 
 	bool basisu_backend::encode_selector_palette()
 	{
-		const basisu_frontend &r = *m_pFront_end;
-										
+		const basisu_frontend& r = *m_pFront_end;
+
 		if ((m_params.m_use_global_sel_codebook) && (!m_params.m_use_hybrid_sel_codebooks))
 		{
 			histogram global_mod_indices(1 << m_params.m_global_sel_codebook_mod_bits);
@@ -1296,7 +1468,7 @@
 				if (m_params.m_global_sel_codebook_mod_bits)
 					total_mod_bits += coder.put_code(m_global_selector_palette_desc[i].m_mod_index, global_mod_model);
 			}
-						
+
 			coder.flush();
 
 			m_output.m_selector_palette = coder.get_bytes();
@@ -1392,7 +1564,7 @@
 				if (!q)
 					continue;
 
-				const basist::etc1_selector_palette_entry &cur = m_selector_palette[m_selector_remap_table_new_to_old[q]];
+				const basist::etc1_selector_palette_entry& cur = m_selector_palette[m_selector_remap_table_new_to_old[q]];
 				const basist::etc1_selector_palette_entry predictor(m_selector_palette[m_selector_remap_table_new_to_old[q - 1]]);
 
 				for (uint32_t j = 0; j < 4; j++)
@@ -1428,7 +1600,7 @@
 					continue;
 				}
 
-				const basist::etc1_selector_palette_entry &cur = m_selector_palette[m_selector_remap_table_new_to_old[q]];
+				const basist::etc1_selector_palette_entry& cur = m_selector_palette[m_selector_remap_table_new_to_old[q]];
 				const basist::etc1_selector_palette_entry predictor(m_selector_palette[m_selector_remap_table_new_to_old[q - 1]]);
 
 				for (uint32_t j = 0; j < 4; j++)
@@ -1463,8 +1635,8 @@
 
 		}  // if (m_params.m_use_global_sel_codebook)        
 
-		debug_printf("Selector codebook bits: %u bytes: %u, Bits per entry: %3.1f, Avg bits/texel: %3.3f\n", 
-			(int)m_output.m_selector_palette.size() * 8, (int)m_output.m_selector_palette.size(), 
+		debug_printf("Selector codebook bits: %u bytes: %u, Bits per entry: %3.1f, Avg bits/texel: %3.3f\n",
+			(int)m_output.m_selector_palette.size() * 8, (int)m_output.m_selector_palette.size(),
 			m_output.m_selector_palette.size() * 8.0f / r.get_total_selector_clusters(), m_output.m_selector_palette.size() * 8.0f / get_total_input_texels());
 
 		return true;
@@ -1472,9 +1644,10 @@
 
 	uint32_t basisu_backend::encode()
 	{
+		const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
 		m_output.m_slice_desc = m_slices;
 		m_output.m_etc1s = m_params.m_etc1s;
-				
+
 		create_endpoint_palette();
 		create_selector_palette();
 
diff --git a/basisu_backend.h b/basisu_backend.h
index cd1f020..e8f783e 100644
--- a/basisu_backend.h
+++ b/basisu_backend.h
@@ -36,6 +36,7 @@
 
 		int m_selector_history_buf_index;
 
+		bool m_is_cr_target;
 		void clear()
 		{
 			m_endpoint_predictor = 0;
@@ -44,6 +45,7 @@
 			m_selector_index = 0;
 						
 			m_selector_history_buf_index = 0;
+			m_is_cr_target = false;
 		}
 	};
 
@@ -105,6 +107,14 @@
 
 	struct basisu_backend_slice_desc
 	{
+		basisu_backend_slice_desc()
+		{
+			clear();
+		}
+		void clear()
+		{
+			clear_obj(*this);
+		}
 		uint32_t m_first_block_index;
 
 		uint32_t m_orig_width;
@@ -122,6 +132,7 @@
 		uint32_t m_source_file_index;		// also the basis image index
 		uint32_t m_mip_index;
 		bool m_alpha;
+		bool m_iframe;
 	};
 
 	typedef std::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;
@@ -308,6 +319,8 @@
 		bool encode_image();
 		bool encode_endpoint_palette();
 		bool encode_selector_palette();
+		int find_video_frame(int slice_index, int delta);
+		void check_for_valid_cr_blocks();
 	};
 
 } // namespace basisu
diff --git a/basisu_basis_file.cpp b/basisu_basis_file.cpp
index e4a73a0..19752c4 100644
--- a/basisu_basis_file.cpp
+++ b/basisu_basis_file.cpp
@@ -86,6 +86,8 @@
 			
 			if (slice_descs[i].m_alpha)
 				m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsIsAlphaData;
+			if (slice_descs[i].m_iframe)
+				m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsFrameIsIFrame;
 
 			m_images_descs[i].m_orig_width = slice_descs[i].m_orig_width;
 			m_images_descs[i].m_orig_height = slice_descs[i].m_orig_height;
diff --git a/basisu_comp.cpp b/basisu_comp.cpp
index 7826a42..89cf004 100644
--- a/basisu_comp.cpp
+++ b/basisu_comp.cpp
@@ -17,12 +17,13 @@
 #include <unordered_set>
 
 #define BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN 0
-#define DEBUG_RESIZE_TEXTURE_TO_64x64 (0)
+#define DEBUG_CROP_TEXTURE_TO_64x64 (0)
+#define DEBUG_RESIZE_TEXTURE (0)
 #define DEBUG_EXTRACT_SINGLE_BLOCK (0)
 
 namespace basisu
 {
-	basis_compressor::basis_compressor() :
+   basis_compressor::basis_compressor() :
 		m_total_blocks(0),
 		m_auto_global_sel_pal(false),
 		m_basis_file_size(0),
@@ -68,6 +69,8 @@
 			PRINT_BOOL_VALUE(m_check_for_alpha)
 			PRINT_BOOL_VALUE(m_force_alpha)
 			PRINT_BOOL_VALUE(m_seperate_rg_to_color_alpha);
+			PRINT_BOOL_VALUE(m_multithreading);
+			PRINT_BOOL_VALUE(m_disable_hierarchical_endpoint_codebooks);
 			
 			PRINT_FLOAT_VALUE(m_hybrid_sel_cb_quality_thresh);
 			
@@ -314,9 +317,14 @@
 			file_image = block_image;
 #endif
 
-#if DEBUG_RESIZE_TEXTURE_TO_64x64
+#if DEBUG_CROP_TEXTURE_TO_64x64
 			file_image.resize(64, 64);
 #endif
+#if DEBUG_RESIZE_TEXTURE
+			image temp_img((file_image.get_width() + 1) / 2, (file_image.get_height() + 1) / 2);
+			image_resample(file_image, temp_img, m_params.m_perceptual, "kaiser");
+			temp_img.swap(file_image);
+#endif
 
 			if ((!file_image.get_width()) || (!file_image.get_height()))
 			{
@@ -444,6 +452,11 @@
 				slice_desc.m_mip_index = mip_indices[slice_index];
 
 				slice_desc.m_alpha = is_alpha_slice;
+				slice_desc.m_iframe = false;
+				if (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)
+				{
+					slice_desc.m_iframe = (source_file_index == 0);
+				}
 
 				m_total_blocks += slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y;
 				total_macroblocks += slice_desc.m_num_macroblocks_x * slice_desc.m_num_macroblocks_y;
@@ -487,8 +500,8 @@
 		{
 			const basisu_backend_slice_desc &slice_desc = m_slice_descs[i];
 
-			printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u\n", 
-				i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index);
+			printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u, iframe: %u\n", 
+				i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe);
 
 			if (m_any_source_image_has_alpha)
 			{
@@ -522,6 +535,11 @@
 
 			if ((slice_desc.m_orig_width > slice_desc.m_width) || (slice_desc.m_orig_height > slice_desc.m_height))
 				return false;
+			if ((slice_desc.m_source_file_index == 0) && (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
+			{
+				if (!slice_desc.m_iframe)
+					return false;
+			}
 		}
 
 		return true;
@@ -778,7 +796,11 @@
 		p.m_debug_stats = m_params.m_debug;
 		p.m_debug_images = m_params.m_debug_images;
 		p.m_compression_level = m_params.m_compression_level;
-		
+		p.m_tex_type = m_params.m_tex_type;
+		p.m_multithreaded = m_params.m_multithreading;
+		p.m_disable_hierarchical_endpoint_codebooks = m_params.m_disable_hierarchical_endpoint_codebooks;
+		p.m_pJob_pool = m_params.m_pJob_pool;
+
 		if ((m_params.m_global_sel_pal) || (m_auto_global_sel_pal))
 		{
 			p.m_pGlobal_sel_codebook = m_params.m_pSel_codebook;
diff --git a/basisu_comp.h b/basisu_comp.h
index 415335e..5311d0b 100644
--- a/basisu_comp.h
+++ b/basisu_comp.h
@@ -186,7 +186,8 @@
 			m_quality_level(-1),
 			m_mip_scale(1.0f, .000125f, 4.0f),
 			m_mip_smallest_dimension(1, 1, 16384),
-			m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL)
+			m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL),
+			m_pJob_pool(nullptr)
 		{
 			clear();
 		}
@@ -217,10 +218,12 @@
 			m_compute_stats.clear();
 			m_check_for_alpha.clear();
 			m_force_alpha.clear();
+			m_multithreading.clear();
 			m_seperate_rg_to_color_alpha.clear();
 			m_hybrid_sel_cb_quality_thresh.clear();
 			m_global_pal_bits.clear();
 			m_global_mod_bits.clear();
+			m_disable_hierarchical_endpoint_codebooks.clear();
 
 			m_no_endpoint_rdo.clear();
 			m_endpoint_rdo_thresh.clear();
@@ -243,6 +246,8 @@
 			m_userdata0 = 0;
 			m_userdata1 = 0;
 			m_us_per_frame = 0;
+
+			m_pJob_pool = nullptr;
 		}
 
 		// Pointer to the global selector codebook, or nullptr to not use a global selector codebook
@@ -303,10 +308,13 @@
 		
 		// Always put alpha slices in the output basis file, even when the input doesn't have alpha
 		bool_param<false> m_force_alpha; 
+		bool_param<true> m_multithreading;
 		
 		// Split the R channel to RGB and the G channel to alpha, then write a basis file with alpha channels
 		bool_param<false> m_seperate_rg_to_color_alpha;
 
+		bool_param<false> m_disable_hierarchical_endpoint_codebooks;
+
 		// Global/hybrid selector codebook parameters
 		param<float> m_hybrid_sel_cb_quality_thresh;
 		param<int> m_global_pal_bits;
@@ -334,6 +342,8 @@
 		uint32_t m_userdata0;
 		uint32_t m_userdata1;
 		uint32_t m_us_per_frame;
+
+		job_pool *m_pJob_pool;
 	};
 	
 	class basis_compressor
diff --git a/basisu_enc.cpp b/basisu_enc.cpp
index 48f724d..ba374b3 100644
--- a/basisu_enc.cpp
+++ b/basisu_enc.cpp
@@ -1197,4 +1197,181 @@
 		}
 	}
 
+	// Hashes len bytes at pBuf to a 32-bit value (Hsieh-style mixing). Returns 0 for a null or empty buffer.
+	// 16-bit words are assembled byte-wise, low byte first: same result as a native load on little-endian hosts, but safe for unaligned pBuf.
+	uint32_t hash_hsieh(const uint8_t *pBuf, size_t len)
+	{
+		if (!pBuf || !len)
+			return 0;
+
+		// Alignment-independent 16-bit read (replaces the reinterpret_cast<const uint16_t *> loads).
+		const auto read16 = [](const uint8_t *p) -> uint32_t { return static_cast<uint32_t>(p[0]) | (static_cast<uint32_t>(p[1]) << 8); };
+		uint32_t h = static_cast<uint32_t>(len);
+		const uint32_t bytes_left = static_cast<uint32_t>(len & 3);
+		len >>= 2;
+
+		// Main loop: consume four bytes (two 16-bit words) per iteration.
+		while (len--)
+		{
+			h += read16(pBuf);
+			const uint32_t t = (read16(pBuf + 2) << 11) ^ h;
+			h = (h << 16) ^ t;
+			pBuf += sizeof(uint32_t);
+			h += h >> 11;
+		}
+
+		// Fold in the 1-3 trailing bytes. A lone trailing byte is sign-extended; the shift is done on the unsigned value.
+		switch (bytes_left)
+		{
+		case 1:
+			h += static_cast<uint32_t>(static_cast<int32_t>(static_cast<signed char>(pBuf[0])));
+			h ^= h << 10;
+			h += h >> 1;
+			break;
+		case 2:
+			h += read16(pBuf);
+			h ^= h << 11;
+			h += h >> 17;
+			break;
+		case 3:
+			h += read16(pBuf);
+			h ^= h << 16;
+			h ^= static_cast<uint32_t>(static_cast<int32_t>(static_cast<signed char>(pBuf[2]))) << 18;
+			h += h >> 11;
+			break;
+		default:
+			break;
+		}
+
+		// Final mixing of the accumulated bits.
+		h ^= h << 3;
+		h += h >> 5;
+		h ^= h << 4;
+		h += h >> 17;
+		h ^= h << 25;
+		h += h >> 6;
+		return h;
+	}
+
+	// Creates a pool of num_threads total threads: the constructing thread plus (num_threads - 1) workers.
+	job_pool::job_pool(uint32_t num_threads) :
+		m_num_active_jobs(0),	// initializers are listed in member declaration order
+		m_kill_flag(false)
+	{
+		assert(num_threads >= 1U);
+		debug_printf("job_pool::job_pool: %u total threads\n", num_threads);
+
+		// With a single thread no workers are started; queued jobs then only run inside wait_for_all().
+		if (num_threads > 1)
+		{
+			m_threads.resize(num_threads - 1);
+			for (uint32_t i = 0; i < (uint32_t)m_threads.size(); i++)
+				m_threads[i] = std::thread([this, i] { job_thread(i); });
+		}
+	}
+
+	// Tells every worker to exit and joins them. Jobs still waiting in the queue are not run.
+	job_pool::~job_pool()
+	{
+		debug_printf("job_pool::~job_pool\n");
+
+		// Set the flag while holding m_mutex: a worker that has just evaluated its wait predicate but not yet blocked would otherwise miss the wakeup, and join() below would hang.
+		{ std::lock_guard<std::mutex> lock(m_mutex); m_kill_flag = true; }
+		m_has_work.notify_all();
+
+		// Wait for all workers to die.
+		for (uint32_t i = 0; i < m_threads.size(); i++)
+			m_threads[i].join();
+	}
+				
+	// Appends a copy of job to the queue. A worker is only signalled once more than one job is pending.
+	void job_pool::add_job(const std::function<void()>& job)
+	{
+		size_t num_pending = 0;
+		{
+			std::lock_guard<std::mutex> guard(m_mutex);
+			m_queue.emplace_back(job);
+			num_pending = m_queue.size();
+		}
+		// The mutex is released before notifying.
+		if (num_pending > 1)
+			m_has_work.notify_one();
+	}
+
+	// Moves job into the queue. A worker is only signalled once more than one job is pending.
+	void job_pool::add_job(std::function<void()>&& job)
+	{
+		size_t num_pending = 0;
+		{
+			std::lock_guard<std::mutex> guard(m_mutex);
+			m_queue.emplace_back(std::move(job));
+			num_pending = m_queue.size();
+		}
+		// The mutex is released before notifying.
+		if (num_pending > 1)
+			m_has_work.notify_one();
+	}
+
+	// Runs queued jobs on the calling thread until the queue is empty, then blocks until no worker is still executing a job.
+	void job_pool::wait_for_all()
+	{
+		std::unique_lock<std::mutex> lock(m_mutex);
+
+		// Drain the job queue on the calling thread.
+		while (!m_queue.empty())
+		{
+			// Move the callable out instead of copying it; its slot is popped right away.
+			std::function<void()> job(std::move(m_queue.back()));
+			m_queue.pop_back();
+			// The job runs with the pool mutex released.
+			lock.unlock();
+			job();
+			lock.lock();
+		}
+
+		// The queue is empty, now wait for all active jobs to finish up.
+		m_no_more_jobs.wait(lock, [this]{ return !m_num_active_jobs; } );
+	}
+
+	// Worker thread body: takes jobs from the back of the queue and runs them until the kill flag is set. index is only used for logging.
+	void job_pool::job_thread(uint32_t index)
+	{
+		debug_printf("job_pool::job_thread: starting %u\n", index);
+
+		while (true)
+		{
+			std::unique_lock<std::mutex> lock(m_mutex);
+
+			// Wait for any jobs to be issued.
+			m_has_work.wait(lock, [this] { return m_kill_flag || m_queue.size(); } );
+
+			// Check to see if we're supposed to exit.
+			if (m_kill_flag)
+				break;
+
+			// Get the job: move it out of the queue rather than copying the std::function.
+			std::function<void()> job(std::move(m_queue.back()));
+			m_queue.pop_back();
+
+			++m_num_active_jobs;
+			lock.unlock();
+
+			job();
+
+			lock.lock();
+
+			--m_num_active_jobs;
+
+			// Now check if there are no more jobs remaining.
+			const bool all_done = m_queue.empty() && !m_num_active_jobs;
+
+			lock.unlock();
+
+			if (all_done)
+				m_no_more_jobs.notify_all();
+		}
+
+		debug_printf("job_pool::job_thread: exiting\n");
+	}
+
 } // namespace basisu
diff --git a/basisu_enc.h b/basisu_enc.h
index 0a9e721..157e717 100644
--- a/basisu_enc.h
+++ b/basisu_enc.h
@@ -17,6 +17,13 @@
 #include "basisu_enc.h"
 #include "transcoder/basisu_transcoder_internal.h"
 
+#include <mutex>
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <thread>
+#include <unordered_map>
+
 #ifndef _WIN32
 #include <libgen.h>
 #endif
@@ -36,6 +43,42 @@
 	{
 		return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);
 	}
+	
+	// Hashing
+	
+	// 32-bit integer mixer: six add/xor-with-constant rounds, each combined with a shifted copy of the value.
+	inline uint32_t bitmix32c(uint32_t v)
+	{
+		v += 0x7ed55d16U + (v << 12);
+		v ^= 0xc761c23cU ^ (v >> 19);
+		v += 0x165667b1U + (v << 5);
+		v = (v << 9) ^ (v + 0xd3a2646cU);
+		v += 0xfd7046c5U + (v << 3);
+		return (v ^ 0xb55a4f09U) ^ (v >> 16);
+	}
+
+	// Constant-free 32-bit integer mixer built from subtract/xor steps with shifted copies of the value. Maps 0 to 0.
+	inline uint32_t bitmix32(uint32_t v)
+	{
+		v = v - (v << 6);
+		v = v ^ (v >> 17);
+		v = v - (v << 9);
+		v = v ^ (v << 4);
+		v = v - (v << 3);
+		v = v ^ (v << 10);
+		return v ^ (v >> 15);
+	}
+
+	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);
+
+	// Hash functor that passes the raw object bytes of a Key (all sizeof(Key) of them) to hash_hsieh().
+	// NOTE(review): any padding bytes inside Key are hashed too - confirm the key types used have none.
+	template <class Key>
+	struct bit_hasher
+	{
+		std::size_t operator()(const Key& k) const
+		{ const uint8_t* pBytes = reinterpret_cast<const uint8_t*>(&k); return hash_hsieh(pBytes, sizeof(Key)); }
+	};
 
 	// Linear algebra
 
@@ -165,8 +208,10 @@
 		inline T length() const { return sqrt(norm()); }
 
 		inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; }
+		inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; }
 
-		inline T distance(const vec &other) const { return squared_distance(other); }
+		inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }
+		inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }
 
 		inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len);	return *this; }
 
@@ -176,6 +221,22 @@
 				m_v[i] = basisu::clamp(m_v[i], l, h);
 			return *this;
 		}
+
+		// Returns the vector whose i-th component is minimum(a[i], b[i]).
+		static vec component_min(const vec& a, const vec& b)
+		{
+			vec lo;
+			for (uint32_t c = 0; c < N; ++c) { lo[c] = minimum(a[c], b[c]); }
+			return lo;
+		}
+
+		// Returns the vector whose i-th component is maximum(a[i], b[i]).
+		static vec component_max(const vec& a, const vec& b)
+		{
+			vec hi;
+			for (uint32_t c = 0; c < N; ++c) { hi[c] = maximum(a[c], b[c]); }
+			return hi;
+		}
 	};
 
 	typedef vec<4, double> vec4D;
@@ -290,6 +351,37 @@
 			[pKeys](uint32_t a, uint32_t b) { return pKeys[a] < pKeys[b]; }
 		);
 	}
+	
+	// Very simple job pool with no dependencies. num_threads counts the constructing thread, so (num_threads - 1) workers are started.
+	class job_pool
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);
+
+	public:
+		job_pool(uint32_t num_threads);
+		~job_pool();
+		// Queue a job (copied or moved in). Jobs are taken from the back of the queue, i.e. newest first.
+		void add_job(const std::function<void()>& job);
+		void add_job(std::function<void()>&& job);
+		// Runs queued jobs on the calling thread, then blocks until no worker is still executing a job.
+		void wait_for_all();
+		// Worker threads plus one.
+		size_t get_total_threads() const { return 1 + m_threads.size(); }
+		
+	private:
+		std::vector<std::thread> m_threads;
+        std::vector<std::function<void()> > m_queue;
+		// m_queue and m_num_active_jobs are modified while holding m_mutex.
+		std::mutex m_mutex;
+		std::condition_variable m_has_work;
+		std::condition_variable m_no_more_jobs;
+		// Number of jobs currently executing on worker threads.
+		uint32_t m_num_active_jobs;
+		// Set by the destructor to make the worker threads exit.
+		std::atomic<bool> m_kill_flag;
+
+		void job_thread(uint32_t index);
+	};
 
 	// Simple 32-bit color class
 
@@ -867,7 +959,8 @@
 	class tree_vector_quant
 	{
 	public:
-		typedef std::pair<TrainingVectorType, uint32_t> training_vec_with_weight;
+		typedef TrainingVectorType training_vec_type;
+		typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
 		typedef std::vector< training_vec_with_weight > array_of_weighted_training_vecs;
 
 		tree_vector_quant() :
@@ -882,7 +975,11 @@
 			m_next_codebook_index = 0;
 		}
 
-		void add_training_vec(const TrainingVectorType &v, uint32_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }
+		void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }
+
+		size_t get_total_training_vecs() const { return m_training_vecs.size(); }
+		const array_of_weighted_training_vecs &get_training_vecs() const	{ return m_training_vecs; }
+				array_of_weighted_training_vecs &get_training_vecs()			{ return m_training_vecs; }
 
 		void retrieve(std::vector< std::vector<uint32_t> > &codebook) const
 		{
@@ -1025,7 +1122,7 @@
 			for (uint32_t i = 0; i < m_training_vecs.size(); i++)
 			{
 				const TrainingVectorType &v = m_training_vecs[i].first;
-				const uint32_t weight = m_training_vecs[i].second;
+				const uint64_t weight = m_training_vecs[i].second;
 
 				root.m_training_vecs.push_back(i);
 
@@ -1049,7 +1146,8 @@
 			float l_var = 0.0f, r_var = 0.0f;
 
 			// Compute initial left/right child origins
-			prep_split(m_nodes[node_index], l_child_org, r_child_org);
+			if (!prep_split(m_nodes[node_index], l_child_org, r_child_org))
+				return false;
 
 			// Use k-means iterations to refine these children vectors
 			if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children))
@@ -1071,6 +1169,34 @@
 			l_child.set(l_child_org, l_weight, l_var, l_children);
 			r_child.set(r_child_org, r_weight, r_var, r_children);
 
+			if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1))
+			{
+				TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first);
+				
+				for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++)
+				{
+					if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first))
+					{
+						l_child.m_var = 1e-4f;
+						break;
+					}
+				}
+			}
+
+			if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1))
+			{
+				TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first);
+
+				for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++)
+				{
+					if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first))
+					{
+						r_child.m_var = 1e-4f;
+						break;
+					}
+				}
+			}
+
 			if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1))
 				var_heap.add_heap(l_child_index, l_var);
 
@@ -1111,7 +1237,7 @@
 			return compute_pca_from_covar<N, TrainingVectorType>(cmatrix);
 		}
 
-		void prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
+		bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
 		{
 			const uint32_t N = TrainingVectorType::num_elements;
 
@@ -1119,7 +1245,7 @@
 			{
 				l_child_result = m_training_vecs[node.m_training_vecs[0]].first;
 				r_child_result = m_training_vecs[node.m_training_vecs[1]].first;
-				return;
+				return true;
 			}
 
 			TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f);
@@ -1152,17 +1278,77 @@
 			}
 			else
 			{
-				// Empty cell problem
-				l_child_result = node.m_origin;
-				r_child_result = node.m_origin;
-
-				// Nudge the two cells apart and hope k-means can separate them.
-				for (uint32_t i = 0; i < N; i++)
+				TrainingVectorType l(1e+20f);
+				TrainingVectorType h(-1e+20f);
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
 				{
-					l_child_result[i] -= .000125f;
-					r_child_result[i] += .000125f;
+					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
+					
+					l = TrainingVectorType::component_min(l, v);
+					h = TrainingVectorType::component_max(h, v);
+				}
+
+				TrainingVectorType r(h - l);
+
+				float largest_axis_v = 0.0f;
+				int largest_axis_index = -1;
+				for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++)
+				{
+					if (r[i] > largest_axis_v)
+					{
+						largest_axis_v = r[i];
+						largest_axis_index = i;
+					}
+				}
+
+				if (largest_axis_index < 0)
+					return false;
+
+				std::vector<float> keys(node.m_training_vecs.size());
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+					keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];
+
+				uint_vec indices(node.m_training_vecs.size());
+				indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]);
+
+				l_child.set_zero();
+				l_weight = 0;
+
+				r_child.set_zero();
+				r_weight = 0;
+
+				const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2;
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+				{
+					const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;
+
+					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
+
+					if (i < half_index)
+					{
+						l_child += v * weight;
+						l_weight += weight;
+					}
+					else
+					{
+						r_child += v * weight;
+						r_weight += weight;
+					}
+				}
+
+				if ((l_weight > 0.0f) && (r_weight > 0.0f))
+				{
+					l_child_result = l_child * static_cast<float>(1.0f / l_weight);
+					r_child_result = r_child * static_cast<float>(1.0f / r_weight);
+				}
+				else
+				{
+					l_child_result = l;
+					r_child_result = h;
 				}
 			}
+
+			return true;
 		}
 
 		bool refine_split(const tsvq_node &node,
@@ -1191,9 +1377,9 @@
 				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
 				{
 					const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;
-					const uint32_t weight = m_training_vecs[node.m_training_vecs[i]].second;
+					const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;
 
-					double left_dist2 = l_child.squared_distance(v), right_dist2 = r_child.squared_distance(v);
+					double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v);
 
 					if (left_dist2 >= right_dist2)
 					{
@@ -1214,7 +1400,36 @@
 				}
 
 				if ((!l_weight) || (!r_weight))
-					return false;
+				{
+					TrainingVectorType firstVec;
+					for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+					{
+						const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
+						const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;
+					
+						if ((!i) || (v == firstVec))
+						{
+							firstVec = v;
+
+							new_r_child += (v * static_cast<float>(weight));
+							r_weight += weight;
+
+							r_ttsum += weight * v.dot(v);
+							r_children.push_back(node.m_training_vecs[i]);
+						}
+						else
+						{
+							new_l_child += (v * static_cast<float>(weight));
+							l_weight += weight;
+
+							l_ttsum += weight * v.dot(v);
+							l_children.push_back(node.m_training_vecs[i]);
+						}
+					}
+
+					if (!l_weight)
+						return false;
+				}
 
 				l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight));
 				r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight));
@@ -1242,6 +1457,238 @@
 		}
 	};
 
+	struct weighted_block_group // One group of identical training vectors, as built by generate_hierarchical_codebook_threaded().
+	{
+		uint64_t m_total_weight; // Sum of the weights of every training vector in the group
+		uint_vec m_indices; // Indices of the group's members in the quantizer's training vector array
+	};
+	// Runs quantizer q to fill codebook (and parent_codebook when max_parent_codebook_size is non-zero); larger inputs are first split into max_threads coarse clusters that are then clustered in parallel on pJob_pool. Returns false if any generate() call fails.
+	template<typename Quantizer>
+	bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,
+		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
+		std::vector<uint_vec>& codebook,
+		std::vector<uint_vec>& parent_codebook,
+		uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)
+	{
+		codebook.resize(0);
+		parent_codebook.resize(0);
+		// Single-threaded path: taken when max_threads <= 1, there are fewer than 256 training vectors, or fewer than 16 codebook entries per thread were requested.
+		if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16))
+		{
+			if (!q.generate(max_codebook_size))
+				return false;
+
+			q.retrieve(codebook);
+
+			if (max_parent_codebook_size)
+				q.retrieve(max_parent_codebook_size, parent_codebook);
+
+			return true;
+		}
+
+		const uint32_t cMaxThreads = 16;
+		if (max_threads > cMaxThreads)
+			max_threads = cMaxThreads;
+		// Coarse pass: generate(max_threads) on q; each retrieved cluster becomes the work item of one job below.
+		if (!q.generate(max_threads))
+			return false;
+
+		std::vector<uint_vec> initial_codebook;
+
+		q.retrieve(initial_codebook);
+		// Fewer coarse clusters than threads came back: return the coarse clusters as the final codebook.
+		if (initial_codebook.size() < max_threads)
+		{
+			codebook = initial_codebook;
+
+			if (max_parent_codebook_size)
+				q.retrieve(max_parent_codebook_size, parent_codebook);
+
+			return true;
+		}
+		// Per-job state, indexed by thread_iter; each job only writes its own slot of these arrays.
+		Quantizer quantizers[cMaxThreads];
+		
+		bool success_flags[cMaxThreads];
+		clear_obj(success_flags);
+
+		std::vector<uint_vec> local_clusters[cMaxThreads];
+		std::vector<uint_vec> local_parent_clusters[cMaxThreads];
+
+		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
+		{
+			pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {
+
+				Quantizer& lq = quantizers[thread_iter];
+				uint_vec& cluster_indices = initial_codebook[thread_iter];
+				// local_to_global[i] is the index in q's training vector array of the i-th vector added to lq.
+				uint_vec local_to_global(cluster_indices.size());
+
+				for (uint32_t i = 0; i < cluster_indices.size(); i++)
+				{
+					const uint32_t global_training_vec_index = cluster_indices[i];
+					local_to_global[i] = global_training_vec_index;
+
+					lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second);
+				}
+				// With limit_clusterizers each job gets an equal (rounded-up) share of max_codebook_size; otherwise it may use one cluster per local training vector.
+				const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs();
+
+				success_flags[thread_iter] = lq.generate(max_clusters);
+
+				if (success_flags[thread_iter])
+				{
+					lq.retrieve(local_clusters[thread_iter]);
+					// Translate the local training vector indices back to q's global indices.
+					for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++)
+					{
+						for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++)
+							local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]];
+					}
+
+					if (max_parent_codebook_size)
+					{
+						lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]);
+
+						for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++)
+						{
+							for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++)
+								local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]];
+						}
+					}
+				}
+
+			} );
+
+		} // thread_iter
+		// The jobs capture this function's locals by reference, so all of them must have finished before anything below runs or this function returns.
+		pJob_pool->wait_for_all();
+
+		uint32_t total_clusters = 0, total_parent_clusters = 0;
+		// A single failed job fails the whole call.
+		for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++)
+		{
+			if (!success_flags[thread_iter])
+				return false;
+			total_clusters += (uint32_t)local_clusters[thread_iter].size();
+			total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size();
+		}
+
+		codebook.reserve(total_clusters);
+		parent_codebook.reserve(total_parent_clusters);
+		// Concatenate the per-job results in job order, swapping the index vectors into place instead of copying them.
+		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
+		{
+			for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++)
+			{
+				codebook.resize(codebook.size() + 1);
+				codebook.back().swap(local_clusters[thread_iter][j]);
+			}
+
+			for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++)
+			{
+				parent_codebook.resize(parent_codebook.size() + 1);
+				parent_codebook.back().swap(local_parent_clusters[thread_iter][j]);
+			}
+		}
+
+		return true;
+	}
+
+	// Builds a codebook (and, when max_parent_codebook_size is non-zero, a parent codebook) over q's training vectors.
+	// Identical training vectors are first merged into one weighted vector so the clusterizer only sees unique vectors;
+	// the resulting clusters are then expanded back into indices of q's original training vectors. Returns false on failure.
+	template<typename Quantizer>
+	bool generate_hierarchical_codebook_threaded(Quantizer& q,
+		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
+		std::vector<uint_vec>& codebook,
+		std::vector<uint_vec>& parent_codebook,
+		uint32_t max_threads, job_pool *pJob_pool)
+	{
+		typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;
+		typedef std::unordered_map<typename Quantizer::training_vec_type, weighted_block_group, training_vec_bit_hasher> group_hash;
+		
+		group_hash unique_vecs;
+
+		weighted_block_group g;
+		g.m_indices.resize(1);
+		// Group identical training vectors: sum their weights and remember every original index.
+		for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)
+		{
+			g.m_total_weight = q.get_training_vecs()[i].second;
+			g.m_indices[0] = i;
+
+			auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));
+
+			if (!ins_res.second)
+			{
+				(ins_res.first)->second.m_total_weight += g.m_total_weight;
+				(ins_res.first)->second.m_indices.push_back(i);
+			}
+		}
+		// get_total_training_vecs() returns size_t; cast it so the argument matches the %u conversion on 64-bit targets.
+		debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", (uint32_t)q.get_total_training_vecs(), (uint32_t)unique_vecs.size());
+
+		Quantizer group_quant;
+		typedef typename group_hash::const_iterator group_hash_const_iter;
+		std::vector<group_hash_const_iter> unique_vec_iters;
+		unique_vec_iters.reserve(unique_vecs.size());
+		// Training vector k of group_quant is the group referenced by unique_vec_iters[k].
+		for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)
+		{
+			group_quant.add_training_vec(iter->first, iter->second.m_total_weight);
+			unique_vec_iters.push_back(iter);
+		}
+
+		const bool limit_clusterizers = (unique_vecs.size() > max_codebook_size);
+
+		debug_printf("Limit clusterizers: %u\n", limit_clusterizers);
+		// Worker threads are only used once there are at least 65536*4 unique vectors; smaller inputs are clustered on one thread.
+		std::vector<uint_vec> group_codebook, group_parent_codebook;
+		bool status = generate_hierarchical_codebook_threaded_internal(group_quant,
+			max_codebook_size, max_parent_codebook_size,
+			group_codebook,
+			group_parent_codebook,
+			(unique_vecs.size() < 65536*4) ? 1 : max_threads, limit_clusterizers, pJob_pool);
+
+		if (!status)
+			return false;
+		// Expand each cluster of groups back into the original training vector indices.
+		codebook.resize(0);
+		for (uint32_t i = 0; i < group_codebook.size(); i++)
+		{
+			codebook.resize(codebook.size() + 1);
+
+			for (uint32_t j = 0; j < group_codebook[i].size(); j++)
+			{
+				const uint32_t group_index = group_codebook[i][j];
+
+				typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
+				const uint_vec& training_vec_indices = group_iter->second.m_indices;
+				
+				append_vector(codebook.back(), training_vec_indices);
+			}
+		}
+		// Same expansion for the parent codebook.
+		parent_codebook.resize(0);
+		for (uint32_t i = 0; i < group_parent_codebook.size(); i++)
+		{
+			parent_codebook.resize(parent_codebook.size() + 1);
+
+			for (uint32_t j = 0; j < group_parent_codebook[i].size(); j++)
+			{
+				const uint32_t group_index = group_parent_codebook[i][j];
+
+				typename group_hash::const_iterator group_iter = unique_vec_iters[group_index];
+				const uint_vec& training_vec_indices = group_iter->second.m_indices;
+
+				append_vector(parent_codebook.back(), training_vec_indices);
+			}
+		}
+
+		return true;
+	}
+
 	// Canonical Huffman coding
 
 	class histogram
@@ -2353,7 +2800,7 @@
 	}
 
 	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);
-		
+
 } // namespace basisu
 
 
diff --git a/basisu_etc.cpp b/basisu_etc.cpp
index 5d6be63..24f233b 100644
--- a/basisu_etc.cpp
+++ b/basisu_etc.cpp
@@ -1250,11 +1250,10 @@
 
 	// Packs solid color blocks efficiently using a set of small precomputed tables.
 	// For random 888 inputs, MSE results are better than Erricson's ETC1 packer in "slow" mode ~9.5% of the time, is slightly worse only ~.01% of the time, and is equal the rest of the time.
-	static uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor, basis_etc1_pack_params& pack_params, pack_etc1_block_context& context)
+	uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor, bool diff_only)
 	{
 		assert(g_etc1_inverse_lookup[0][255]);
 
-		context, pack_params;
 		static uint32_t s_next_comp[4] = { 1, 2, 0, 1 };
 
 		uint32_t best_error = UINT32_MAX, best_i = 0;
@@ -1281,6 +1280,10 @@
 				do
 				{
 					const uint32_t x = *pTable++;
+					
+					const bool diff = (x & 1) != 0;
+					if ((diff_only) && (!diff))
+						continue;
 
 #ifdef BASISU_BUILD_DEBUG
 					const uint32_t diff = x & 1;
@@ -1309,6 +1312,9 @@
 		}
 	found_perfect_match:
 
+		if (best_error == UINT32_MAX)
+			return UINT64_MAX;
+
 		const uint32_t diff = best_x & 1;
 		const uint32_t inten = (best_x >> 1) & 7;
 
@@ -1338,14 +1344,11 @@
 	static uint32_t pack_etc1_block_solid_color_constrained(
 		etc1_optimizer::results& results,
 		uint32_t num_colors, const uint8_t *pColor,
-		basis_etc1_pack_params& pack_params,
-		pack_etc1_block_context& context,
 		bool use_diff,
 		const color_rgba* pBase_color5_unscaled)
 	{
 		assert(g_etc1_inverse_lookup[0][255]);
 
-		context, pack_params;
 		static uint32_t s_next_comp[4] = { 1, 2, 0, 1 };
 
 		uint32_t best_error = UINT32_MAX, best_i = 0;
@@ -1587,7 +1590,7 @@
 
 		return upper_lower_sum < left_right_sum;
 	}
-
+		
 	uint64_t pack_etc1_block(etc_block& dst_block, const color_rgba* pSrc_pixels, basis_etc1_pack_params& pack_params, pack_etc1_block_context& context, const uint8_t* pForce_selectors)
 	{
 #if BASISU_DEBUG_ETC_ENCODER
@@ -1607,7 +1610,7 @@
 #if BASISU_DEBUG_ETC_ENCODER
 				printf("** Block is a single solid color\n");
 #endif
-				uint64_t err = 16 * pack_etc1_block_solid_color(dst_block, &pSrc_pixels[0].r, pack_params, context);
+				uint64_t err = 16 * pack_etc1_block_solid_color(dst_block, &pSrc_pixels[0].r);
 				dst_block.set_flip_bit(true);
 				return err;
 			}
@@ -1741,7 +1744,7 @@
 									break;
 							if (!r)
 							{
-								pack_etc1_block_solid_color_constrained(results[2], 8, &subblock_pixel0.r, pack_params, context, !use_color4, (subblock && !use_color4) ? &results[0].m_block_color_unscaled : nullptr);
+								pack_etc1_block_solid_color_constrained(results[2], 8, &subblock_pixel0.r, !use_color4, (subblock && !use_color4) ? &results[0].m_block_color_unscaled : nullptr);
 							}
 						}
 
@@ -1961,5 +1964,6 @@
 
 		return best_error;
 	}
-
+	
+			
 } // namespace basisu
diff --git a/basisu_etc.h b/basisu_etc.h
index 398dcfa..da7b5f0 100644
--- a/basisu_etc.h
+++ b/basisu_etc.h
@@ -558,6 +558,21 @@
 		uint64_t evaluate_etc1_error(const color_rgba* pBlock_pixels, bool perceptual, int subblock_index = -1) const;
 		void get_subblock_pixels(color_rgba* pPixels, int subblock_index = -1) const;
 
+		// Reports the smallest (low) and largest (high) selector value found among the block's 16 pixels.
+		void get_selector_range(uint32_t& low, uint32_t& high) const
+		{
+			// Begin with the inverted range (3, 0); the first selector visited narrows it (selectors presumably lie in 0..3 - TODO confirm).
+			low = 3; high = 0;
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				const uint32_t sel = get_selector(i & 3, i >> 2);
+				if (sel < low)
+					low = sel;
+				if (sel > high)
+					high = sel;
+			}
+		}
+
 		void set_block_color4(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled)
 		{
 			set_diff_bit(false);
@@ -1034,6 +1049,6 @@
 
 	bool pack_etc1_estimate_flipped(const color_rgba* pSrc_pixels);
 		
-	uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor);
+	uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor, bool diff_only = true);
 
 } // namespace basisu
diff --git a/basisu_frontend.cpp b/basisu_frontend.cpp
index 168f58d..32d6822 100644
--- a/basisu_frontend.cpp
+++ b/basisu_frontend.cpp
@@ -22,28 +22,58 @@
 #include <unordered_set>
 #include <unordered_map>
 
-#define BASISU_FRONTEND_VERIFY(c) verify(c, __LINE__);
+#define BASISU_FRONTEND_VERIFY(c) do { if (!(c)) handle_verify_failure(__LINE__); } while(0)
 
 namespace basisu
 {
+	const uint32_t cMaxCodebookCreationThreads = 8;
+
 	const uint32_t BASISU_MAX_ENDPOINT_REFINEMENT_STEPS = 3;
 	const uint32_t BASISU_MAX_SELECTOR_REFINEMENT_STEPS = 3;
 
 	const uint32_t BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE = 16;
 	const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE = 16;
-
+	
 	// TODO - How to handle internal verifies in the basisu lib
-	static inline void verify(bool condition, int line)
+	static inline void handle_verify_failure(int line)
 	{
-		if (!condition)
-		{
 			fprintf(stderr, "ERROR: basisu_frontend: verify check failed at line %i!\n", line);
 			abort();
-		}
 	}
 			
 	bool basisu_frontend::init(const params &p)
 	{
+#if 0
+		// HACK HACK
+		FILE* pFile;
+		fopen_s(&pFile, "tv.bin", "rb");
+		if (pFile)
+		{
+			debug_printf("Using tv.bin\n");
+
+			fseek(pFile, 0, SEEK_END);
+			uint32_t size = ftell(pFile);
+			fseek(pFile, 0, SEEK_SET);
+
+			uint32_t tv = size / sizeof(vec6F_quantizer::training_vec_with_weight);
+
+			std::vector<vec6F_quantizer::training_vec_with_weight> v(tv);
+			fread(&v[0], 1, sizeof(v[0]) * tv, pFile);
+
+			for (uint32_t i = 0; i < tv; i++)
+				m_endpoint_clusterizer.add_training_vec(v[i].first, v[i].second);
+
+			m_endpoint_clusterizer.generate(16128);
+			std::vector<uint_vec> codebook;
+			m_endpoint_clusterizer.retrieve(codebook);
+
+			printf("Generated %u entries\n", (uint32_t)codebook.size());
+
+			fclose(pFile);
+			exit(0);
+		}
+#endif
+
 		if (p.m_use_hybrid_selector_codebooks)
 		{
 			if (!p.m_pGlobal_sel_codebook)
@@ -53,8 +83,8 @@
 			}
 		}
 
-		debug_printf("basisu_frontend::init: NumEndpointClusters: %u, NumSelectorClusters: %u, Perceptual: %u, CompressionLevel: %u\n",
-			p.m_max_endpoint_clusters, p.m_max_selector_clusters, p.m_perceptual, p.m_compression_level);
+		debug_printf("basisu_frontend::init: Multithreaded: %u, NumEndpointClusters: %u, NumSelectorClusters: %u, Perceptual: %u, CompressionLevel: %u\n",
+			p.m_multithreaded, p.m_max_endpoint_clusters, p.m_max_selector_clusters, p.m_perceptual, p.m_compression_level);
 
 		debug_printf("Global sel codebook pal bits: %u, Global sel codebook mod bits: %u, Use hybrid selector codebook: %u, Hybrid codebook quality thresh: %f\n",
 			p.m_num_global_sel_codebook_pal_bits,
@@ -92,6 +122,7 @@
 			m_endpoint_refinement = true;
 			m_use_hierarchical_endpoint_codebooks = true;
 			m_use_hierarchical_selector_codebooks = true;
+
 			break;
 		}
 		case 2:
@@ -131,6 +162,9 @@
 
 		}
 
+		if (m_params.m_disable_hierarchical_endpoint_codebooks)
+			m_use_hierarchical_endpoint_codebooks = false;
+
 		debug_printf("Endpoint refinement: %u, Hierarchical endpoint codebooks: %u, Hierarchical selector codebooks: %u, Endpoint codebook iters: %u, Selector codebook iters: %u\n", 
 			m_endpoint_refinement, m_use_hierarchical_endpoint_codebooks, m_use_hierarchical_selector_codebooks, m_num_endpoint_codebook_iterations, m_num_selector_codebook_iterations);
 
@@ -164,12 +198,8 @@
 			if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))
 			{
 				char buf[256];
-#ifdef _WIN32				
-				sprintf_s(buf, sizeof(buf), "endpoint_cluster_vis_pre_%u.png", refine_endpoint_step);
-#else
 				snprintf(buf, sizeof(buf), "endpoint_cluster_vis_pre_%u.png", refine_endpoint_step);
-#endif				
-				dump_endpoint_clusterization_visualization(buf);
+				dump_endpoint_clusterization_visualization(buf, false);
 			}
 
 			bool early_out = false;
@@ -181,18 +211,24 @@
 				if (!refine_endpoint_clusterization())
 					early_out = true;
 
+				if ((m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) && (!refine_endpoint_step) && (m_num_endpoint_codebook_iterations == 1))
+				{
+					eliminate_redundant_or_empty_endpoint_clusters();
+					generate_endpoint_codebook(refine_endpoint_step);
+				}
+
 				if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))
 				{
 					char buf[256];
-#ifdef _WIN32					
-					sprintf_s(buf, sizeof(buf), "endpoint_cluster_vis_post_%u.png", refine_endpoint_step);
-#else
 					snprintf(buf, sizeof(buf), "endpoint_cluster_vis_post_%u.png", refine_endpoint_step);
-#endif					
-					dump_endpoint_clusterization_visualization(buf);
+
+					dump_endpoint_clusterization_visualization(buf, false);
+					snprintf(buf, sizeof(buf), "endpoint_cluster_colors_vis_post_%u.png", refine_endpoint_step);
+
+					dump_endpoint_clusterization_visualization(buf, true);
 				}
 			}
-			
+						
 			eliminate_redundant_or_empty_endpoint_clusters();
 
 			if (m_params.m_debug_stats)
@@ -208,7 +244,7 @@
 
 		create_initial_packed_texture();
 
-		create_selector_clusters();
+		generate_selector_clusters();
 
 		if (m_use_hierarchical_selector_codebooks)
 			compute_selector_clusters_within_each_parent_cluster();
@@ -218,6 +254,8 @@
 			create_optimized_selector_codebook(0);
 
 			find_optimal_selector_clusters_for_each_block();
+			
+			introduce_special_selector_clusters();
 		}
 		else
 		{
@@ -228,7 +266,9 @@
 
 				find_optimal_selector_clusters_for_each_block();
 
-				if (m_params.m_compression_level >= 3)
+				introduce_special_selector_clusters();
+				
+				if ((m_params.m_compression_level >= 3) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
 				{
 					if (!refine_block_endpoints_given_selectors())
 						break;
@@ -254,6 +294,110 @@
 		return true;
 	}
 
+	void basisu_frontend::introduce_special_selector_clusters()
+	{
+		debug_printf("introduce_special_selector_clusters\n");
+		// Adds any missing flat (single-selector) pattern to the selector codebook and moves blocks onto it when that lowers their error.
+		if (m_params.m_pGlobal_sel_codebook)
+			return; // skipped when a global selector codebook is supplied
+
+		uint32_t total_blocks_relocated = 0;
+		const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_indices.size();
+		// One flag per block, set once the block has been moved into a newly added flat cluster.
+		bool_vec block_relocated_flags(m_total_blocks);
+
+		// Make sure the selector codebook always has pure flat blocks for each possible selector, to avoid obvious artifacts.
+		// optimize_selector_codebook() will clean up any redundant clusters we create here.
+		for (uint32_t sel = 0; sel < 4; sel++)
+		{
+			etc_block blk;
+			clear_obj(blk);
+			for (uint32_t j = 0; j < 16; j++)
+				blk.set_selector(j & 3, j >> 2, sel);
+			// Skip this selector value if the codebook already contains an entry with identical selector bits.
+			int k;
+			for (k = 0; k < (int)m_optimized_cluster_selectors.size(); k++)
+				if (m_optimized_cluster_selectors[k].get_raw_selector_bits() == blk.get_raw_selector_bits())
+					break;
+			if (k < (int)m_optimized_cluster_selectors.size())
+				continue;
+
+			debug_printf("Introducing sel %u\n", sel);
+
+			const uint32_t new_selector_cluster_index = (uint32_t)m_optimized_cluster_selectors.size();
+
+			m_optimized_cluster_selectors.push_back(blk);
+			
+			vector_ensure_element_is_valid(m_selector_cluster_indices, new_selector_cluster_index);
+			// Only blocks whose originally encoded selectors are exactly this flat pattern are candidates for relocation.
+			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			{
+				if (m_orig_encoded_blocks[block_index].get_raw_selector_bits() != blk.get_raw_selector_bits())
+					continue;
+
+				// See if using flat selectors actually decreases the block's error.
+				const uint32_t old_selector_cluster_index = m_block_selector_cluster_index[block_index];
+				
+				etc_block cur_blk;
+				const uint32_t endpoint_cluster_index = get_subblock_endpoint_cluster_index(block_index, 0);
+				cur_blk.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(endpoint_cluster_index, false));
+				cur_blk.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(endpoint_cluster_index, false));
+				cur_blk.set_raw_selector_bits(get_selector_cluster_selector_bits(old_selector_cluster_index).get_raw_selector_bits());
+
+				const uint64_t cur_err = cur_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);
+
+				cur_blk.set_raw_selector_bits(blk.get_raw_selector_bits());
+
+				const uint64_t new_err = cur_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);
+
+				if (new_err >= cur_err)
+					continue;
+				
+				// Change the block to use the new cluster
+				m_block_selector_cluster_index[block_index] = new_selector_cluster_index;
+				
+				m_selector_cluster_indices[new_selector_cluster_index].push_back(block_index);
+
+				block_relocated_flags[block_index] = true;
+				// Disabled: per-block removal from the old cluster; the compaction pass after the loops removes relocated blocks in bulk instead.
+#if 0
+				int j = vector_find(m_selector_cluster_indices[old_selector_cluster_index], block_index);
+				if (j >= 0)
+					m_selector_cluster_indices[old_selector_cluster_index].erase(m_selector_cluster_indices[old_selector_cluster_index].begin() + j);
+#endif
+
+				total_blocks_relocated++;
+
+				m_encoded_blocks[block_index].set_raw_selector_bits(blk.get_raw_selector_bits());
+
+			} // block_index
+
+		} // sel
+
+		if (total_blocks_relocated)
+		{
+			debug_printf("Fixing selector codebook\n");
+			// Compact the block list of every cluster that existed on entry, dropping each relocated block in place.
+			for (int selector_cluster_index = 0; selector_cluster_index < (int)initial_selector_clusters; selector_cluster_index++)
+			{
+				uint_vec& block_indices = m_selector_cluster_indices[selector_cluster_index];
+
+				uint32_t dst_ofs = 0;
+
+				for (uint32_t i = 0; i < block_indices.size(); i++)
+				{
+					const uint32_t block_index = block_indices[i];
+					if (!block_relocated_flags[block_index])
+						block_indices[dst_ofs++] = block_index;
+				}
+
+				block_indices.resize(dst_ofs);
+			}
+		}
+
+		debug_printf("Total blocks relocated to new flat selector clusters: %u\n", total_blocks_relocated);
+	}
+
 	void basisu_frontend::optimize_selector_codebook()
 	{
 		debug_printf("optimize_selector_codebook\n");
@@ -331,80 +475,118 @@
 				
 		m_etc1_blocks_etc1s.resize(m_total_blocks);
 
-#pragma omp parallel for
-		for (int block_index = 0; block_index < static_cast<int>(m_total_blocks); block_index++)
+		const uint32_t N = 4096;
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 		{
-			const pixel_block &source_blk = get_source_pixel_block(block_index);
+			const uint32_t first_index = block_index_iter;                                        
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);       
+                                                                                      
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
 
-			etc1_optimizer optimizer;
-			etc1_optimizer::params optimizer_params;
-			etc1_optimizer::results optimizer_results;
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++) 
+				{
+					const pixel_block &source_blk = get_source_pixel_block(block_index);
+
+					etc1_optimizer optimizer;
+					etc1_optimizer::params optimizer_params;
+					etc1_optimizer::results optimizer_results;
 			
-			if (m_params.m_compression_level == 0)
-				optimizer_params.m_quality = cETCQualityFast;
-			else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
-				optimizer_params.m_quality = cETCQualityUber;
+					if (m_params.m_compression_level == 0)
+						optimizer_params.m_quality = cETCQualityFast;
+					else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+						optimizer_params.m_quality = cETCQualityUber;
 						
-			optimizer_params.m_num_src_pixels = 16;
-			optimizer_params.m_pSrc_pixels = source_blk.get_ptr();
-			optimizer_params.m_perceptual = m_params.m_perceptual;
+					optimizer_params.m_num_src_pixels = 16;
+					optimizer_params.m_pSrc_pixels = source_blk.get_ptr();
+					optimizer_params.m_perceptual = m_params.m_perceptual;
 
-			uint8_t selectors[16];
-			optimizer_results.m_pSelectors = selectors;
-			optimizer_results.m_n = 16;
+					uint8_t selectors[16];
+					optimizer_results.m_pSelectors = selectors;
+					optimizer_results.m_n = 16;
 
-			optimizer.init(optimizer_params, optimizer_results);
-			optimizer.compute();
+					optimizer.init(optimizer_params, optimizer_results);
+					optimizer.compute();
 			
-			etc_block &blk = m_etc1_blocks_etc1s[block_index];
+					etc_block &blk = m_etc1_blocks_etc1s[block_index];
 
-			memset(&blk, 0, sizeof(blk));
-			blk.set_block_color5_etc1s(optimizer_results.m_block_color_unscaled);
-			blk.set_inten_tables_etc1s(optimizer_results.m_block_inten_table);
-			blk.set_flip_bit(true);
+					memset(&blk, 0, sizeof(blk));
+					blk.set_block_color5_etc1s(optimizer_results.m_block_color_unscaled);
+					blk.set_inten_tables_etc1s(optimizer_results.m_block_inten_table);
+					blk.set_flip_bit(true);
 
-			for (uint32_t y = 0; y < 4; y++)
-				for (uint32_t x = 0; x < 4; x++)
-					blk.set_selector(x, y, selectors[x + y * 4]);
+					for (uint32_t y = 0; y < 4; y++)
+						for (uint32_t x = 0; x < 4; x++)
+							blk.set_selector(x, y, selectors[x + y * 4]);
+				}
+
+			} );
 		}
+		                                     
+		m_params.m_pJob_pool->wait_for_all();
 	}
 
 	void basisu_frontend::init_endpoint_training_vectors()
 	{
 		debug_printf("init_endpoint_training_vectors\n");
 								
-		for (int block_index = 0; block_index < (int)m_total_blocks; block_index++)
+		vec6F_quantizer::array_of_weighted_training_vecs &training_vecs = m_endpoint_clusterizer.get_training_vecs();
+		
+		training_vecs.resize(m_total_blocks * 2);
+
+		const uint32_t N = 16384;
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 		{
-			const etc_block &blk = m_etc1_blocks_etc1s[block_index];
+			const uint32_t first_index = block_index_iter;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-			color_rgba block_colors[2];
-			blk.get_block_low_high_colors(block_colors, 0);
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] {
 
-			vec6F v;
-			v[0] = block_colors[0].r * (1.0f / 255.0f);
-			v[1] = block_colors[0].g * (1.0f / 255.0f);
-			v[2] = block_colors[0].b * (1.0f / 255.0f);
-			v[3] = block_colors[1].r * (1.0f / 255.0f);
-			v[4] = block_colors[1].g * (1.0f / 255.0f);
-			v[5] = block_colors[1].b * (1.0f / 255.0f);
-			
-			m_endpoint_clusterizer.add_training_vec(v, 1);
-			m_endpoint_clusterizer.add_training_vec(v, 1);
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{			
+					const etc_block &blk = m_etc1_blocks_etc1s[block_index];
 
-		} // block_index
+					color_rgba block_colors[2];
+					blk.get_block_low_high_colors(block_colors, 0);
+				
+					vec6F v;
+					v[0] = block_colors[0].r * (1.0f / 255.0f);
+					v[1] = block_colors[0].g * (1.0f / 255.0f);
+					v[2] = block_colors[0].b * (1.0f / 255.0f);
+					v[3] = block_colors[1].r * (1.0f / 255.0f);
+					v[4] = block_colors[1].g * (1.0f / 255.0f);
+					v[5] = block_colors[1].b * (1.0f / 255.0f);
+				
+					training_vecs[block_index * 2 + 0] = std::make_pair(v, 1);
+					training_vecs[block_index * 2 + 1] = std::make_pair(v, 1);
+
+				} // block_index;
+
+			} );
+
+		} // block_index_iter
+
+		m_params.m_pJob_pool->wait_for_all();
 	}
 
 	void basisu_frontend::generate_endpoint_clusters()
 	{
 		debug_printf("Begin endpoint quantization\n");
 
-		m_endpoint_clusterizer.generate(m_params.m_max_endpoint_clusters);
+		const uint32_t parent_codebook_size = (m_params.m_max_endpoint_clusters >= 256) ? BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE : 0;
+		uint32_t max_threads = 0;
+		max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
 
-		m_endpoint_clusterizer.retrieve(m_endpoint_clusters);
+		debug_printf("Using %u threads to create codebook\n", max_threads);
+		bool status = generate_hierarchical_codebook_threaded(m_endpoint_clusterizer,
+			m_params.m_max_endpoint_clusters, m_use_hierarchical_endpoint_codebooks ? parent_codebook_size : 0,
+			m_endpoint_clusters,
+			m_endpoint_parent_clusters,
+			max_threads, m_params.m_pJob_pool);
+		BASISU_FRONTEND_VERIFY(status);
 
 		if (m_use_hierarchical_endpoint_codebooks)
 		{
-			if (m_endpoint_clusters.size() < BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE)
+			if (!m_endpoint_parent_clusters.size())
 			{
 				m_endpoint_parent_clusters.resize(0);
 				m_endpoint_parent_clusters.resize(1);
@@ -414,13 +596,12 @@
 					m_endpoint_parent_clusters[0].push_back(i*2+1);
 				}
 			}
-			else
-				m_endpoint_clusterizer.retrieve(BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE, m_endpoint_parent_clusters);
 
 			BASISU_ASSUME(BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE <= UINT8_MAX);
 
 			m_block_parent_endpoint_cluster.resize(0);
 			m_block_parent_endpoint_cluster.resize(m_total_blocks);
+			vector_set_all(m_block_parent_endpoint_cluster, 0xFF);
 			for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_endpoint_parent_clusters.size(); parent_cluster_index++)
 			{
 				const uint_vec &cluster = m_endpoint_parent_clusters[parent_cluster_index];
@@ -431,6 +612,11 @@
 				}
 			}
 
+			for (uint32_t i = 0; i < m_total_blocks; i++)
+			{
+				BASISU_FRONTEND_VERIFY(m_block_parent_endpoint_cluster[i] != 0xFF);
+			}
+
 			// Ensure that all the blocks within each cluster are all in the same parent cluster, or something is very wrong.
 			for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)
 			{
@@ -514,71 +700,84 @@
 	{
 		m_subblock_endpoint_quant_err_vec.resize(0);
 
-#pragma omp parallel for
-		for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
+		const uint32_t N = 512;
+		for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
 		{
-			const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+			const uint32_t first_index = cluster_index_iter;                                    
+			const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);   
 
-			assert(cluster_indices.size());
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
 
-			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
-			{
-				std::vector<color_rgba> cluster_pixels(8);
-
-				const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
-				const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
-
-				const bool flipped = true;
-
-				const color_rgba *pSource_block_pixels = get_source_pixel_block(block_index).get_ptr();
-
-				for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
+				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
-					cluster_pixels[pixel_index] = pSource_block_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
-				}
+					const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
-				const endpoint_cluster_etc_params &etc_params = m_endpoint_cluster_etc_params[cluster_index];
+					assert(cluster_indices.size());
 
-				assert(etc_params.m_valid);
-																				
-				color_rgba block_colors[4];
-				etc_block::get_block_colors5(block_colors, etc_params.m_color_unscaled[0], etc_params.m_inten_table[0], true);
-
-				uint64_t total_err = 0;
-
-				for (uint32_t i = 0; i < 8; i++)
-				{
-					const color_rgba &c = cluster_pixels[i];
-
-					uint64_t best_err = UINT64_MAX;
-					uint32_t best_index = 0;
-
-					for (uint32_t s = 0; s < 4; s++)
+					for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 					{
-						uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
-						if (err < best_err)
+						std::vector<color_rgba> cluster_pixels(8);
+
+						const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+						const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+
+						const bool flipped = true;
+
+						const color_rgba *pSource_block_pixels = get_source_pixel_block(block_index).get_ptr();
+
+						for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
 						{
-							best_err = err;
-							best_index = s;
+							cluster_pixels[pixel_index] = pSource_block_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
+						}
+
+						const endpoint_cluster_etc_params &etc_params = m_endpoint_cluster_etc_params[cluster_index];
+
+						assert(etc_params.m_valid);
+																				
+						color_rgba block_colors[4];
+						etc_block::get_block_colors5(block_colors, etc_params.m_color_unscaled[0], etc_params.m_inten_table[0], true);
+
+						uint64_t total_err = 0;
+
+						for (uint32_t i = 0; i < 8; i++)
+						{
+							const color_rgba &c = cluster_pixels[i];
+
+							uint64_t best_err = UINT64_MAX;
+							//uint32_t best_index = 0;
+
+							for (uint32_t s = 0; s < 4; s++)
+							{
+								uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
+								if (err < best_err)
+								{
+									best_err = err;
+									//best_index = s;
+								}
+							}
+
+							total_err += best_err;
+						}
+
+						subblock_endpoint_quant_err quant_err;
+						quant_err.m_total_err = total_err;
+						quant_err.m_cluster_index = cluster_index;
+						quant_err.m_cluster_subblock_index = cluster_indices_iter;
+						quant_err.m_block_index = block_index;
+						quant_err.m_subblock_index = subblock_index;
+					
+						{
+							std::lock_guard<std::mutex> lock(m_lock);
+
+							m_subblock_endpoint_quant_err_vec.push_back(quant_err);
 						}
 					}
+				} // cluster_index
 
-					total_err += best_err;
-				}
+			} );
+		} // cluster_index_iter
 
-				subblock_endpoint_quant_err quant_err;
-				quant_err.m_total_err = total_err;
-				quant_err.m_cluster_index = cluster_index;
-				quant_err.m_cluster_subblock_index = cluster_indices_iter;
-				quant_err.m_block_index = block_index;
-				quant_err.m_subblock_index = subblock_index;
-
-#pragma omp critical
-				{
-					m_subblock_endpoint_quant_err_vec.push_back(quant_err);
-				}
-			}
-		}
+		m_params.m_pJob_pool->wait_for_all();
 
 		vector_sort(m_subblock_endpoint_quant_err_vec);
 	}
@@ -628,6 +827,15 @@
 			if (unordered_set_contains(training_vector_was_relocated, training_vector_index ^ 1))
 				continue;
 
+#if 0
+			const uint32_t block_index = subblock_to_move.m_block_index;
+			const etc_block& blk = m_etc1_blocks_etc1s[block_index];
+			uint32_t ls, hs;
+			blk.get_selector_range(ls, hs);
+			if (ls != hs)
+				continue;
+#endif
+
 			const uint32_t new_endpoint_cluster_index = (uint32_t)m_endpoint_clusters.size();
 
 			enlarge_vector(m_endpoint_clusters, 1)->push_back(training_vector_index);
@@ -679,123 +887,135 @@
 
 		m_endpoint_cluster_etc_params.resize(m_endpoint_clusters.size());
 
-#pragma omp parallel for
-		for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
+		const uint32_t N = 128;
+		for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
 		{
-			//debug_printf("%u of %u\n", cluster_index, clusters.size());
+			const uint32_t first_index = cluster_index_iter;                                    
+			const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);   
+			
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, step ] {
 
-			const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
-
-			BASISU_FRONTEND_VERIFY(cluster_indices.size());
-
-			const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;
-
-			std::vector<color_rgba> cluster_pixels(total_pixels);
-
-			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
-			{
-				const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
-				const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
-
-				const bool flipped = true;
-
-				const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
-
-				for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
+				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
-					cluster_pixels[cluster_indices_iter * 8 + pixel_index] = pBlock_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
-				}
-			}
+					const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
-			endpoint_cluster_etc_params new_subblock_params;
-						
-			{
-				etc1_optimizer optimizer;
-				etc1_solution_coordinates solutions[2];
+					BASISU_FRONTEND_VERIFY(cluster_indices.size());
 
-				etc1_optimizer::params cluster_optimizer_params;
-				cluster_optimizer_params.m_num_src_pixels = total_pixels;
-				cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
+					const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;
 
-				cluster_optimizer_params.m_use_color4 = false;
-				cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+					std::vector<color_rgba> cluster_pixels(total_pixels);
 
-				if (m_params.m_compression_level == 0)
-					cluster_optimizer_params.m_quality = cETCQualityMedium;
-				else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
-					cluster_optimizer_params.m_quality = cETCQualityUber;
-
-				etc1_optimizer::results cluster_optimizer_results;
-
-				std::vector<uint8_t> cluster_selectors(total_pixels);
-				cluster_optimizer_results.m_n = total_pixels;
-				cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
-
-				optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
-
-				optimizer.compute();
-
-				new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
-				new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
-				new_subblock_params.m_color_error[0] = cluster_optimizer_results.m_error;
-			} 
-
-			endpoint_cluster_etc_params &prev_etc_params = m_endpoint_cluster_etc_params[cluster_index];
-
-			bool use_new_subblock_params = false;
-			if ((!step) || (!prev_etc_params.m_valid))
-				use_new_subblock_params = true;
-			else
-			{
-				assert(prev_etc_params.m_valid);
-
-				uint64_t total_prev_err = 0;
-								
-				{
-					color_rgba block_colors[4];
-
-					etc_block::get_block_colors5(block_colors, prev_etc_params.m_color_unscaled[0], prev_etc_params.m_inten_table[0], false);
-
-					uint64_t total_err = 0;
-
-					for (uint32_t i = 0; i < total_pixels; i++)
+					for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 					{
-						const color_rgba &c = cluster_pixels[i];
+						const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+						const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
 
-						uint64_t best_err = UINT64_MAX;
-						uint32_t best_index = 0;
+						const bool flipped = true;
 
-						for (uint32_t s = 0; s < 4; s++)
+						const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+						for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
 						{
-							uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
-							if (err < best_err)
-							{
-								best_err = err;
-								best_index = s;
-							}
+							const color_rgba &c = pBlock_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
+							cluster_pixels[cluster_indices_iter * 8 + pixel_index] = c;
 						}
-
-						total_err += best_err;
 					}
 
-					total_prev_err += total_err;
-				}
+					endpoint_cluster_etc_params new_subblock_params;
+						
+					{
+						etc1_optimizer optimizer;
+						etc1_solution_coordinates solutions[2];
 
-				// See if we should update this cluster's endpoints (if the error has actually fallen)
-				if (total_prev_err > (new_subblock_params.m_color_error[0] + new_subblock_params.m_color_error[1]))
-				{
-					use_new_subblock_params = true;
-				}
-			}
+						etc1_optimizer::params cluster_optimizer_params;
+						cluster_optimizer_params.m_num_src_pixels = total_pixels;
+						cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
 
-			if (use_new_subblock_params)
-			{
-				new_subblock_params.m_valid = true;
+						cluster_optimizer_params.m_use_color4 = false;
+						cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
 
-				prev_etc_params = new_subblock_params;
-			}
+						if (m_params.m_compression_level == 0)
+							cluster_optimizer_params.m_quality = cETCQualityMedium;
+						else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+							cluster_optimizer_params.m_quality = cETCQualityUber;
 
-		} // cluster_index
+						etc1_optimizer::results cluster_optimizer_results;
+
+						std::vector<uint8_t> cluster_selectors(total_pixels);
+						cluster_optimizer_results.m_n = total_pixels;
+						cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
+
+						optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
+
+						optimizer.compute();
+
+						new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
+						new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
+						new_subblock_params.m_color_error[0] = cluster_optimizer_results.m_error;
+					} 
+
+					endpoint_cluster_etc_params &prev_etc_params = m_endpoint_cluster_etc_params[cluster_index];
+
+					bool use_new_subblock_params = false;
+					if ((!step) || (!prev_etc_params.m_valid))
+						use_new_subblock_params = true;
+					else
+					{
+						assert(prev_etc_params.m_valid);
+
+						uint64_t total_prev_err = 0;
+								
+						{
+							color_rgba block_colors[4];
+
+							etc_block::get_block_colors5(block_colors, prev_etc_params.m_color_unscaled[0], prev_etc_params.m_inten_table[0], false);
+
+							uint64_t total_err = 0;
+
+							for (uint32_t i = 0; i < total_pixels; i++)
+							{
+								const color_rgba &c = cluster_pixels[i];
+
+								uint64_t best_err = UINT64_MAX;
+								//uint32_t best_index = 0;
+
+								for (uint32_t s = 0; s < 4; s++)
+								{
+									uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
+									if (err < best_err)
+									{
+										best_err = err;
+										//best_index = s;
+									}
+								}
+
+								total_err += best_err;
+							}
+
+							total_prev_err += total_err;
+						}
+
+						// See if we should update this cluster's endpoints (if the error has actually fallen)
+						if (total_prev_err > new_subblock_params.m_color_error[0])
+						{
+							use_new_subblock_params = true;
+						}
+					}
+
+					if (use_new_subblock_params)
+					{
+						new_subblock_params.m_valid = true;
+
+						prev_etc_params = new_subblock_params;
+					}
+				
+				} // cluster_index
+
+			} );
+
+		} // cluster_index_iter
+
+		m_params.m_pJob_pool->wait_for_all();
 	}
 
 	bool basisu_frontend::check_etc1s_constraints() const
@@ -854,78 +1074,91 @@
 
 		uint_vec best_cluster_indices(m_total_blocks);
 
-#pragma omp parallel for
-		for (int block_index = 0; block_index < (int)m_total_blocks; block_index++)
+		const uint32_t N = 1024;
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 		{
-			const bool is_flipped = true;
-			
-			const uint32_t cluster_index = block_clusters[block_index][0];
-			BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);
+			const uint32_t first_index = block_index_iter;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-			const color_rgba *subblock_pixels = get_source_pixel_block(block_index).get_ptr();
-			const uint32_t num_subblock_pixels = 16;
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &best_cluster_indices, &block_clusters] {
 
-			uint64_t best_cluster_err = UINT64_MAX;
-			uint32_t best_cluster_index = 0;
-
-			const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;
-			const uint_vec *pCluster_indices = m_endpoint_clusters_within_each_parent_cluster.size() ? &m_endpoint_clusters_within_each_parent_cluster[block_parent_endpoint_cluster_index] : nullptr;
-
-			const uint32_t total_clusters = m_use_hierarchical_endpoint_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_endpoint_clusters.size();
-			
-			for (uint32_t i = 0; i < total_clusters; i++)
-			{
-				const uint32_t cluster_iter = m_use_hierarchical_endpoint_codebooks ? (*pCluster_indices)[i] : i;
-
-				color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0]);
-				uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0];
-
-				uint64_t total_err = 0;
-
-				const uint32_t low_selector = 0;//subblock_etc_params_vec[j].m_low_selectors[0];
-				const uint32_t high_selector = 3;//subblock_etc_params_vec[j].m_high_selectors[0];
-				color_rgba subblock_colors[4];
-				// Can't assign it here - may result in too much error when selector quant occurs
-				if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 				{
-					total_err = UINT64_MAX;
-					goto skip_cluster;
-				}
+					const bool is_flipped = true;
+			
+					const uint32_t cluster_index = block_clusters[block_index][0];
+					BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);
 
-				etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);
+					const color_rgba *subblock_pixels = get_source_pixel_block(block_index).get_ptr();
+					const uint32_t num_subblock_pixels = 16;
 
-				for (uint32_t p = 0; p < num_subblock_pixels; p++)
-				{
-					uint64_t best_err = UINT64_MAX;
+					uint64_t best_cluster_err = UINT64_MAX;
+					uint32_t best_cluster_index = 0;
 
-					for (uint32_t r = low_selector; r <= high_selector; r++)
+					const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;
+					const uint_vec *pCluster_indices = m_endpoint_clusters_within_each_parent_cluster.size() ? &m_endpoint_clusters_within_each_parent_cluster[block_parent_endpoint_cluster_index] : nullptr;
+
+					const uint32_t total_clusters = m_use_hierarchical_endpoint_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_endpoint_clusters.size();
+			
+					for (uint32_t i = 0; i < total_clusters; i++)
 					{
-						uint64_t err = color_distance(m_params.m_perceptual, subblock_pixels[p], subblock_colors[r], false);
-						best_err = minimum(best_err, err);
-						if (!best_err)
-							break;
-					}
+						const uint32_t cluster_iter = m_use_hierarchical_endpoint_codebooks ? (*pCluster_indices)[i] : i;
 
-					total_err += best_err;
-					if (total_err > best_cluster_err)
-						break;
-				} // p
+						color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0]);
+						uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0];
 
-			skip_cluster:
-				if ((total_err < best_cluster_err) ||
-					((cluster_iter == cluster_index) && (total_err == best_cluster_err)))
-				{
-					best_cluster_err = total_err;
-					best_cluster_index = cluster_iter;
+						uint64_t total_err = 0;
+
+						const uint32_t low_selector = 0;//subblock_etc_params_vec[j].m_low_selectors[0];
+						const uint32_t high_selector = 3;//subblock_etc_params_vec[j].m_high_selectors[0];
+						color_rgba subblock_colors[4];
+						// Can't assign it here - may result in too much error when selector quant occurs
+						if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])
+						{
+							total_err = UINT64_MAX;
+							goto skip_cluster;
+						}
+
+						etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);
+
+						for (uint32_t p = 0; p < num_subblock_pixels; p++)
+						{
+							uint64_t best_err = UINT64_MAX;
+
+							for (uint32_t r = low_selector; r <= high_selector; r++)
+							{
+								uint64_t err = color_distance(m_params.m_perceptual, subblock_pixels[p], subblock_colors[r], false);
+								best_err = minimum(best_err, err);
+								if (!best_err)
+									break;
+							}
+
+							total_err += best_err;
+							if (total_err > best_cluster_err)
+								break;
+						} // p
+
+					skip_cluster:
+						if ((total_err < best_cluster_err) ||
+							((cluster_iter == cluster_index) && (total_err == best_cluster_err)))
+						{
+							best_cluster_err = total_err;
+							best_cluster_index = cluster_iter;
 					
-					if (!best_cluster_err)
-						break;
-				}
-			} // j
+							if (!best_cluster_err)
+								break;
+						}
+					} // i
 						
-			best_cluster_indices[block_index] = best_cluster_index;
+					best_cluster_indices[block_index] = best_cluster_index;
 
-		} // block_index
+				} // block_index
+			
+			} );
+						
+		} // block_index_iter
+		
+		m_params.m_pJob_pool->wait_for_all();
 
 		std::vector<typename std::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size());
 		uint32_t total_subblocks_reassigned = 0;
@@ -1024,29 +1257,42 @@
 	{
 		debug_printf("create_initial_packed_texture\n");
 
-#pragma omp parallel for
-		for (int block_index = 0; block_index < static_cast<int>(m_total_blocks); block_index++)
+		const uint32_t N = 4096;
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 		{
-			uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];
-			uint32_t cluster1 = m_block_endpoint_clusters_indices[block_index][1];
-			BASISU_FRONTEND_VERIFY(cluster0 == cluster1);
+			const uint32_t first_index = block_index_iter;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-			const color_rgba *pSource_pixels = get_source_pixel_block(block_index).get_ptr();
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
+				
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];
+					uint32_t cluster1 = m_block_endpoint_clusters_indices[block_index][1];
+					BASISU_FRONTEND_VERIFY(cluster0 == cluster1);
 
-			etc_block &blk = m_encoded_blocks[block_index];
+					const color_rgba *pSource_pixels = get_source_pixel_block(block_index).get_ptr();
 
-			color_rgba unscaled[2] = { m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0], m_endpoint_cluster_etc_params[cluster1].m_color_unscaled[0] };
-			uint32_t inten[2] = { m_endpoint_cluster_etc_params[cluster0].m_inten_table[0], m_endpoint_cluster_etc_params[cluster1].m_inten_table[0] };
+					etc_block &blk = m_encoded_blocks[block_index];
+
+					color_rgba unscaled[2] = { m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0], m_endpoint_cluster_etc_params[cluster1].m_color_unscaled[0] };
+					uint32_t inten[2] = { m_endpoint_cluster_etc_params[cluster0].m_inten_table[0], m_endpoint_cluster_etc_params[cluster1].m_inten_table[0] };
 									
-			blk.set_block_color5(unscaled[0], unscaled[1]);
-			blk.set_flip_bit(true);
+					blk.set_block_color5(unscaled[0], unscaled[1]);
+					blk.set_flip_bit(true);
 
-			blk.set_inten_table(0, inten[0]);
-			blk.set_inten_table(1, inten[1]);
+					blk.set_inten_table(0, inten[0]);
+					blk.set_inten_table(1, inten[1]);
 
-			blk.determine_selectors(pSource_pixels, m_params.m_perceptual);
+					blk.determine_selectors(pSource_pixels, m_params.m_perceptual);
 						
-		} // block_index
+				} // block_index
+			
+			} );
+
+		} // block_index_iter
+
+		m_params.m_pJob_pool->wait_for_all();
 
 		m_orig_encoded_blocks = m_encoded_blocks;
 	}
@@ -1093,64 +1339,85 @@
 		}
 	}
 
-	void basisu_frontend::create_selector_clusters()
+	void basisu_frontend::generate_selector_clusters()
 	{
-		debug_printf("create_selector_clusters\n");
+		debug_printf("generate_selector_clusters\n");
 
 		typedef vec<16, float> vec16F;
 		typedef tree_vector_quant<vec16F> vec16F_clusterizer;
 				
 		vec16F_clusterizer::array_of_weighted_training_vecs training_vecs(m_total_blocks);
 				
-#pragma omp parallel for
-		for (int block_index = 0; block_index < static_cast<int>(m_total_blocks); block_index++)
+		const uint32_t N = 4096;
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 		{
-			const etc_block &blk = m_encoded_blocks[block_index];
+			const uint32_t first_index = block_index_iter;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-			vec16F v;
-			for (uint32_t y = 0; y < 4; y++)
-				for (uint32_t x = 0; x < 4; x++)
-					v[x + y * 4] = static_cast<float>(blk.get_selector(x, y));
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] {
 
-			const uint32_t subblock_index = (blk.get_inten_table(0) > blk.get_inten_table(1)) ? 0 : 1;
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					const etc_block &blk = m_encoded_blocks[block_index];
 
-			color_rgba block_colors[2];
-			blk.get_block_low_high_colors(block_colors, subblock_index);
+					vec16F v;
+					for (uint32_t y = 0; y < 4; y++)
+						for (uint32_t x = 0; x < 4; x++)
+							v[x + y * 4] = static_cast<float>(blk.get_selector(x, y));
 
-			const uint32_t dist = color_distance(m_params.m_perceptual, block_colors[0], block_colors[1], false);
+					const uint32_t subblock_index = (blk.get_inten_table(0) > blk.get_inten_table(1)) ? 0 : 1;
 
-			const uint32_t cColorDistToWeight = 300;
-			const uint32_t cMaxWeight = 4096;
-			uint32_t weight = clamp<uint32_t>(dist / cColorDistToWeight, 1, cMaxWeight);
+					color_rgba block_colors[2];
+					blk.get_block_low_high_colors(block_colors, subblock_index);
+
+					const uint32_t dist = color_distance(m_params.m_perceptual, block_colors[0], block_colors[1], false);
+
+					const uint32_t cColorDistToWeight = 300;
+					const uint32_t cMaxWeight = 4096;
+					uint32_t weight = clamp<uint32_t>(dist / cColorDistToWeight, 1, cMaxWeight);
 						
-			training_vecs[block_index].first = v;
-			training_vecs[block_index].second = weight;
-		}
+					training_vecs[block_index].first = v;
+					training_vecs[block_index].second = weight;
+				
+				} // block_index
+
+			} );
+
+		} // block_index_iter
+
+		m_params.m_pJob_pool->wait_for_all();
 
 		vec16F_clusterizer selector_clusterizer;
 		for (uint32_t i = 0; i < m_total_blocks; i++)
 			selector_clusterizer.add_training_vec(training_vecs[i].first, training_vecs[i].second);
 
-		selector_clusterizer.generate(m_params.m_max_selector_clusters);
+		const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE : 0;
 
-		selector_clusterizer.retrieve(m_selector_cluster_indices);
+		uint32_t max_threads = 0;
+		max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
+
+		bool status = generate_hierarchical_codebook_threaded(selector_clusterizer,
+			m_params.m_max_selector_clusters, m_use_hierarchical_selector_codebooks ? parent_codebook_size : 0,
+			m_selector_cluster_indices,
+			m_selector_parent_cluster_indices,
+			max_threads, m_params.m_pJob_pool);
+		BASISU_FRONTEND_VERIFY(status);
 
 		if (m_use_hierarchical_selector_codebooks)
 		{
-			if (m_selector_cluster_indices.size() < BASISU_SELECTOR_PARENT_CODEBOOK_SIZE)
+			if (!m_selector_parent_cluster_indices.size())
 			{
 				m_selector_parent_cluster_indices.resize(0);
 				m_selector_parent_cluster_indices.resize(1);
 				for (uint32_t i = 0; i < m_total_blocks; i++)
 					m_selector_parent_cluster_indices[0].push_back(i);
 			}
-			else
-				selector_clusterizer.retrieve(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE, m_selector_parent_cluster_indices);
 
 			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE <= UINT8_MAX);
 
 			m_block_parent_selector_cluster.resize(0);
 			m_block_parent_selector_cluster.resize(m_total_blocks);
+			vector_set_all(m_block_parent_selector_cluster, 0xFF);
 
 			for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_selector_parent_cluster_indices.size(); parent_cluster_index++)
 			{
@@ -1158,6 +1425,10 @@
 				for (uint32_t j = 0; j < cluster.size(); j++)
 					m_block_parent_selector_cluster[cluster[j]] = static_cast<uint8_t>(parent_cluster_index);
 			}
+			for (uint32_t i = 0; i < m_total_blocks; i++)
+			{
+				BASISU_FRONTEND_VERIFY(m_block_parent_selector_cluster[i] != 0xFF);
+			}
 
 			// Ensure that all the blocks within each cluster are all in the same parent cluster, or something is very wrong.
 			for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_indices.size(); cluster_index++)
@@ -1197,56 +1468,70 @@
 
 			m_optimized_cluster_selector_global_cb_ids.resize(total_selector_clusters);
 
-#pragma omp parallel for
-			for (int cluster_index = 0; cluster_index < static_cast<int>(total_selector_clusters); cluster_index++)
+			const uint32_t N = 256;
+			for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
 			{
-				const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
+				const uint32_t first_index = cluster_index_iter;                                    
+				const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);   
+			
+				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &total_clusters_processed, &total_selector_clusters] {
+					
+					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+					{
+						const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
 
-				if (!cluster_block_indices.size())
-					continue;
+						if (!cluster_block_indices.size())
+							continue;
 
-				etc_block_vec etc_blocks;
-				pixel_block_vec pixel_blocks;
+						etc_block_vec etc_blocks;
+						pixel_block_vec pixel_blocks;
 
-				for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
-				{
-					const uint32_t block_index = cluster_block_indices[cluster_block_index];
+						for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
+						{
+							const uint32_t block_index = cluster_block_indices[cluster_block_index];
 
-					etc_blocks.push_back(m_encoded_blocks[block_index]);
+							etc_blocks.push_back(m_encoded_blocks[block_index]);
 
-					pixel_blocks.push_back(get_source_pixel_block(block_index));
-				}
+							pixel_blocks.push_back(get_source_pixel_block(block_index));
+						}
 
-				uint32_t palette_index;
-				basist::etc1_global_palette_entry_modifier palette_modifier;
+						uint32_t palette_index;
+						basist::etc1_global_palette_entry_modifier palette_modifier;
 
-#if 0
-				m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
-					palette_index, palette_modifier,
-					m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
-#else
-				etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook,
-					(uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
-					palette_index, palette_modifier,
-					m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
-#endif
+		#if 0
+						m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
+							palette_index, palette_modifier,
+							m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#else
+						etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook,
+							(uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
+							palette_index, palette_modifier,
+							m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#endif
 
-				m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
+						m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
 
-				basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
+						basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
 
-				for (uint32_t y = 0; y < 4; y++)
-					for (uint32_t x = 0; x < 4; x++)
-						m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
+						for (uint32_t y = 0; y < 4; y++)
+							for (uint32_t x = 0; x < 4; x++)
+								m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
 
-#pragma omp critical
-				{
-					total_clusters_processed++;
-					if ((total_clusters_processed % 63) == 0)
-						debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
-				}
+						{
+							std::lock_guard<std::mutex> lock(m_lock);
 
-			} // cluster_index
+							total_clusters_processed++;
+							if ((total_clusters_processed % 63) == 0)
+								debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
+						}
+
+					} // cluster_index
+
+				} );
+
+			} // cluster_index_iter
+
+			m_params.m_pJob_pool->wait_for_all();
 		}
 		else
 		{
@@ -1261,119 +1546,131 @@
 
 			// For each selector codebook entry, and for each of the 4x4 selectors, determine which selector minimizes the error across all the blocks that use that quantized selector.
 
-#pragma omp parallel for
-			for (int cluster_index = 0; cluster_index < static_cast<int>(total_selector_clusters); cluster_index++)
+			const uint32_t N = 256;
+			for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
 			{
-				const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
-
-				if (!cluster_block_indices.size())
-					continue;
-
-				uint64_t overall_best_err = 0;
-
-				for (uint32_t y = 0; y < 4; y++)
-				{
-					for (uint32_t x = 0; x < 4; x++)
+				const uint32_t first_index = cluster_index_iter;                                    
+				const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);   
+			
+				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &uses_hybrid_sel_codebook, &total_clusters_processed, &total_selector_clusters] {
+					
+					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 					{
-						uint64_t best_err = UINT64_MAX;
-						uint32_t best_s = 0;
+						const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
 
-						for (uint32_t s = 0; s < 4; s++)
+						if (!cluster_block_indices.size())
+							continue;
+
+						uint64_t overall_best_err = 0;
+
+						for (uint32_t y = 0; y < 4; y++)
 						{
-							uint32_t total_err = 0;
+							for (uint32_t x = 0; x < 4; x++)
+							{
+								uint64_t best_err = UINT64_MAX;
+								uint32_t best_s = 0;
+
+								for (uint32_t s = 0; s < 4; s++)
+								{
+									uint32_t total_err = 0;
+
+									for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
+									{
+										const uint32_t block_index = cluster_block_indices[cluster_block_index];
+
+										const etc_block &blk = m_encoded_blocks[block_index];
+
+										const color_rgba &orig_color = get_source_pixel_block(block_index)(x, y);
+
+										color_rgba block_color;
+										blk.get_block_color(block_color, blk.get_subblock_index(x, y), s);
+										total_err += color_distance(m_params.m_perceptual, block_color, orig_color, false);
+
+										if (total_err > best_err)
+											break;
+
+									} // block_index
+
+									if (total_err < best_err)
+									{
+										best_err = total_err;
+										best_s = s;
+										if (!best_err)
+											break;
+									}
+
+								} // s
+
+								m_optimized_cluster_selectors[cluster_index].set_selector(x, y, best_s);
+
+								overall_best_err += best_err;
+
+							} // x
+						} // y
+
+						if (uses_hybrid_sel_codebook)
+						{
+							etc_block_vec etc_blocks;
+							pixel_block_vec pixel_blocks;
 
 							for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
 							{
 								const uint32_t block_index = cluster_block_indices[cluster_block_index];
 
-								const etc_block &blk = m_encoded_blocks[block_index];
+								etc_blocks.push_back(m_encoded_blocks[block_index]);
 
-								const color_rgba &orig_color = get_source_pixel_block(block_index)(x, y);
-
-								color_rgba block_color;
-								blk.get_block_color(block_color, blk.get_subblock_index(x, y), s);
-								total_err += color_distance(m_params.m_perceptual, block_color, orig_color, false);
-
-								if (total_err > best_err)
-									break;
-
-							} // block_index
-
-							if (total_err < best_err)
-							{
-								best_err = total_err;
-								best_s = s;
-								if (!best_err)
-									break;
+								pixel_blocks.push_back(get_source_pixel_block(block_index));
 							}
 
-						} // s
+							uint32_t palette_index;
+							basist::etc1_global_palette_entry_modifier palette_modifier;
 
-						m_optimized_cluster_selectors[cluster_index].set_selector(x, y, best_s);
+		#if 0
+							uint64_t best_global_cb_err = m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
+								palette_index, palette_modifier,
+								m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#else
+							uint64_t best_global_cb_err = etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook, (uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
+								palette_index, palette_modifier,
+								m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#endif
 
-						overall_best_err += best_err;
+							if (best_global_cb_err <= overall_best_err * m_params.m_hybrid_codebook_quality_thresh)
+							{
+								m_selector_cluster_uses_global_cb[cluster_index] = true;
 
-					} // x
-				} // y
+								m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
 
-				if (uses_hybrid_sel_codebook)
-				{
-					etc_block_vec etc_blocks;
-					pixel_block_vec pixel_blocks;
+								basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
 
-					for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
-					{
-						const uint32_t block_index = cluster_block_indices[cluster_block_index];
+								for (uint32_t y = 0; y < 4; y++)
+									for (uint32_t x = 0; x < 4; x++)
+										m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
+							}
+							else
+							{
+								m_optimized_cluster_selector_global_cb_ids[cluster_index].set(0, basist::etc1_global_palette_entry_modifier(0));
 
-						etc_blocks.push_back(m_encoded_blocks[block_index]);
+								m_selector_cluster_uses_global_cb[cluster_index] = false;
+							}
+						}
 
-						pixel_blocks.push_back(get_source_pixel_block(block_index));
-					}
+						if (uses_hybrid_sel_codebook)
+						{
+							std::lock_guard<std::mutex> lock(m_lock);
+		
+							total_clusters_processed++;
+							if ((total_clusters_processed % 63) == 0)
+								debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
+						}
 
-					uint32_t palette_index;
-					basist::etc1_global_palette_entry_modifier palette_modifier;
+					} // cluster_index
 
-#if 0
-					uint64_t best_global_cb_err = m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
-						palette_index, palette_modifier,
-						m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
-#else
-					uint64_t best_global_cb_err = etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook, (uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
-						palette_index, palette_modifier,
-						m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
-#endif
+				} );
 
-					if (best_global_cb_err <= overall_best_err * m_params.m_hybrid_codebook_quality_thresh)
-					{
-						m_selector_cluster_uses_global_cb[cluster_index] = true;
+			} // cluster_index_iter
 
-						m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
-
-						basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
-
-						for (uint32_t y = 0; y < 4; y++)
-							for (uint32_t x = 0; x < 4; x++)
-								m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
-					}
-					else
-					{
-						m_optimized_cluster_selector_global_cb_ids[cluster_index].set(0, basist::etc1_global_palette_entry_modifier(0));
-
-						m_selector_cluster_uses_global_cb[cluster_index] = false;
-					}
-				}
-
-				if (uses_hybrid_sel_codebook)
-				{
-#pragma omp critical
-					{
-						total_clusters_processed++;
-						if ((total_clusters_processed % 63) == 0)
-							debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
-					}
-				}
-
-			} // cluster_index
+			m_params.m_pJob_pool->wait_for_all();
 
 		} // if (m_params.m_pGlobal_sel_codebook)
 
@@ -1410,11 +1707,7 @@
 				}
 
 				char buf[256];
-#ifdef _WIN32				
-				sprintf_s(buf, sizeof(buf), "selector_cluster_vis_%u.png", iter);
-#else
 				snprintf(buf, sizeof(buf), "selector_cluster_vis_%u.png", iter);
-#endif				
 				save_png(buf, selector_cluster_vis);
 			}
 		}
@@ -1441,66 +1734,81 @@
 
 			// For each block: Determine which quantized selectors best encode that block, given its quantized endpoints.
 
-	#pragma omp parallel for
-			for (int block_index = 0; block_index < static_cast<int>(m_total_blocks); block_index++)
+			const uint32_t N = 1024;
+			for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 			{
-				const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+				const uint32_t first_index = block_index_iter;
+				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
-				etc_block& blk = m_encoded_blocks[block_index];
+				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &new_cluster_indices] {
+
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+					etc_block& blk = m_encoded_blocks[block_index];
 			
-				color_rgba trial_block_colors[4];
-				blk.get_block_colors(trial_block_colors, 0);
+					color_rgba trial_block_colors[4];
+					blk.get_block_colors(trial_block_colors, 0);
 
-				uint64_t best_cluster_err = UINT64_MAX;
-				uint32_t best_cluster_index = 0;
+					uint64_t best_cluster_err = UINT64_MAX;
+					uint32_t best_cluster_index = 0;
 
-				const uint32_t parent_selector_cluster = m_block_parent_selector_cluster.size() ? m_block_parent_selector_cluster[block_index] : 0;
-				const uint_vec *pCluster_indices = m_selector_clusters_within_each_parent_cluster.size() ? &m_selector_clusters_within_each_parent_cluster[parent_selector_cluster] : nullptr;
+					const uint32_t parent_selector_cluster = m_block_parent_selector_cluster.size() ? m_block_parent_selector_cluster[block_index] : 0;
+					const uint_vec *pCluster_indices = m_selector_clusters_within_each_parent_cluster.size() ? &m_selector_clusters_within_each_parent_cluster[parent_selector_cluster] : nullptr;
 
-				const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_indices.size();
+					const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_indices.size();
 
-				for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
-				{
-					const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
-
-					const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
-								
-					uint64_t trial_err = 0;
-					for (int y = 0; y < 4; y++)
+					for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
 					{
-						for (int x = 0; x < 4; x++)
+						const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
+
+						const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
+								
+						uint64_t trial_err = 0;
+						for (int y = 0; y < 4; y++)
 						{
-							const uint32_t sel = cluster_blk.get_selector(x, y);
+							for (int x = 0; x < 4; x++)
+							{
+								const uint32_t sel = cluster_blk.get_selector(x, y);
 
-							trial_err += color_distance(m_params.m_perceptual, trial_block_colors[sel], pBlock_pixels[x + y * 4], false);
-							if (trial_err > best_cluster_err)
-								goto early_out;
+								trial_err += color_distance(m_params.m_perceptual, trial_block_colors[sel], pBlock_pixels[x + y * 4], false);
+								if (trial_err > best_cluster_err)
+									goto early_out;
+							}
 						}
-					}
 								
-					if (trial_err < best_cluster_err)
-					{
-						best_cluster_err = trial_err;
-						best_cluster_index = cluster_index;
-						if (!best_cluster_err) 
-							break;
+						if (trial_err < best_cluster_err)
+						{
+							best_cluster_err = trial_err;
+							best_cluster_index = cluster_index;
+							if (!best_cluster_err) 
+								break;
+						}
+
+					early_out:
+						;
 					}
 
-				early_out:
-					;
-				}
+					blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());
 
-				blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());
+					m_block_selector_cluster_index[block_index] = best_cluster_index;
+	
+					{
+						std::lock_guard<std::mutex> lock(m_lock);
 
-				m_block_selector_cluster_index[block_index] = best_cluster_index;
+						vector_ensure_element_is_valid(new_cluster_indices, best_cluster_index);
+						new_cluster_indices[best_cluster_index].push_back(block_index);
+					}
+					
+				} // block_index
 
-	#pragma omp critical
-				{
-					vector_ensure_element_is_valid(new_cluster_indices, best_cluster_index);
-					new_cluster_indices[best_cluster_index].push_back(block_index);
-				}
-			}
+				} );
 
+			} // block_index_iter
+						
+			m_params.m_pJob_pool->wait_for_all();
+			
 			m_selector_cluster_indices.swap(new_cluster_indices);
 		}
 
@@ -1697,7 +2005,7 @@
 		return total_subblocks_refined;
 	}
 
-	void basisu_frontend::dump_endpoint_clusterization_visualization(const char *pFilename)
+	void basisu_frontend::dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors)
 	{
 		debug_printf("dump_endpoint_clusterization_visualization\n");
 
@@ -1744,19 +2052,31 @@
 				const uint32_t block_index = training_vector_index >> 1;
 				const uint32_t subblock_index = training_vector_index & 1;
 
+				const etc_block& blk2 = m_etc1_blocks_etc1s[block_index];
+
 				const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
 
-				const etc_block &blk2 = m_etc1_blocks_etc1s[block_index];
-
 				color_rgba subblock_pixels[8];
-				for (uint32_t i = 0; i < 8; i++)
-					subblock_pixels[i] = pBlock_pixels[g_etc1_pixel_indices[blk2.get_flip_bit()][subblock_index][i]];
+
+				if (vis_endpoint_colors)
+				{
+					color_rgba colors[2];
+					blk2.get_block_low_high_colors(colors, subblock_index);
+					for (uint32_t i = 0; i < 8; i++)
+						subblock_pixels[i] = colors[subblock_index];
+				}
+				else
+				{
+					for (uint32_t i = 0; i < 8; i++)
+						subblock_pixels[i] = pBlock_pixels[g_etc1_pixel_indices[blk2.get_flip_bit()][subblock_index][i]];
+				}
 
 				endpoint_cluster_vis.set_block_clipped(subblock_pixels, 12 + 5 * subblock_iter, 3 * unsorted_cluster_iter, 4, 2);
 			}
 		}
 
 		save_png(pFilename, endpoint_cluster_vis);
+		debug_printf("Wrote debug visualization file %s\n", pFilename);
 	}
 
 	void basisu_frontend::finalize()
@@ -1786,91 +2106,102 @@
 		std::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size());
 		std::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size());
 		
-#pragma omp parallel for
-		for (int cluster_index = 0; cluster_index < static_cast<int>(new_endpoint_cluster_block_indices.size()); cluster_index++)
+		const uint32_t N = 256;
+		for (uint32_t cluster_index_iter = 0; cluster_index_iter < new_endpoint_cluster_block_indices.size(); cluster_index_iter += N)
 		{
-			const std::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index];
+			const uint32_t first_index = cluster_index_iter;                                    
+			const uint32_t last_index = minimum<uint32_t>((uint32_t)new_endpoint_cluster_block_indices.size(), cluster_index_iter + N);   
+			
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &cluster_improved, &cluster_valid, &new_endpoint_cluster_block_indices, &pBlock_selector_indices ] {
+				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+				{
+					const std::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index];
 
-			if (!cluster_block_indices.size())
-				continue;
+					if (!cluster_block_indices.size())
+						continue;
 
-			const uint32_t total_pixels = (uint32_t)cluster_block_indices.size() * 16;
+					const uint32_t total_pixels = (uint32_t)cluster_block_indices.size() * 16;
 
-			std::vector<color_rgba> cluster_pixels(total_pixels);
-			uint8_vec force_selectors(total_pixels);
+					std::vector<color_rgba> cluster_pixels(total_pixels);
+					uint8_vec force_selectors(total_pixels);
 
-			etc_block blk;
-			blk.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(cluster_index, false));
-			blk.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(cluster_index, false));
-			blk.set_flip_bit(true);
+					etc_block blk;
+					blk.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(cluster_index, false));
+					blk.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(cluster_index, false));
+					blk.set_flip_bit(true);
 						
-			uint64_t cur_err = 0;
+					uint64_t cur_err = 0;
 
-			for (uint32_t cluster_block_indices_iter = 0; cluster_block_indices_iter < cluster_block_indices.size(); cluster_block_indices_iter++)
-			{
-				const uint32_t block_index = cluster_block_indices[cluster_block_indices_iter];
+					for (uint32_t cluster_block_indices_iter = 0; cluster_block_indices_iter < cluster_block_indices.size(); cluster_block_indices_iter++)
+					{
+						const uint32_t block_index = cluster_block_indices[cluster_block_indices_iter];
 				
-				const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+						const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
 
-				memcpy(&cluster_pixels[cluster_block_indices_iter * 16], pBlock_pixels, 16 * sizeof(color_rgba));
+						memcpy(&cluster_pixels[cluster_block_indices_iter * 16], pBlock_pixels, 16 * sizeof(color_rgba));
 
-				const uint32_t selector_cluster_index = pBlock_selector_indices ? (*pBlock_selector_indices)[block_index] : get_block_selector_cluster_index(block_index);
+						const uint32_t selector_cluster_index = pBlock_selector_indices ? (*pBlock_selector_indices)[block_index] : get_block_selector_cluster_index(block_index);
 
-				const etc_block &blk_selectors = get_selector_cluster_selector_bits(selector_cluster_index);
+						const etc_block &blk_selectors = get_selector_cluster_selector_bits(selector_cluster_index);
 
-				blk.set_raw_selector_bits(blk_selectors.get_raw_selector_bits());
+						blk.set_raw_selector_bits(blk_selectors.get_raw_selector_bits());
 
-				cur_err += blk.evaluate_etc1_error(pBlock_pixels, m_params.m_perceptual);
+						cur_err += blk.evaluate_etc1_error(pBlock_pixels, m_params.m_perceptual);
 				
-				for (uint32_t y = 0; y < 4; y++)
-					for (uint32_t x = 0; x < 4; x++)
-						force_selectors[cluster_block_indices_iter * 16 + x + y * 4] = static_cast<uint8_t>(blk_selectors.get_selector(x, y));
-			}
+						for (uint32_t y = 0; y < 4; y++)
+							for (uint32_t x = 0; x < 4; x++)
+								force_selectors[cluster_block_indices_iter * 16 + x + y * 4] = static_cast<uint8_t>(blk_selectors.get_selector(x, y));
+					}
 
-			endpoint_cluster_etc_params new_endpoint_cluster_etc_params;
+					endpoint_cluster_etc_params new_endpoint_cluster_etc_params;
 						
-			{
-				etc1_optimizer optimizer;
-				etc1_solution_coordinates solutions[2];
+					{
+						etc1_optimizer optimizer;
+						etc1_solution_coordinates solutions[2];
 
-				etc1_optimizer::params cluster_optimizer_params;
-				cluster_optimizer_params.m_num_src_pixels = total_pixels;
-				cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
+						etc1_optimizer::params cluster_optimizer_params;
+						cluster_optimizer_params.m_num_src_pixels = total_pixels;
+						cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
 
-				cluster_optimizer_params.m_use_color4 = false;
-				cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
-				cluster_optimizer_params.m_pForce_selectors = &force_selectors[0];
+						cluster_optimizer_params.m_use_color4 = false;
+						cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+						cluster_optimizer_params.m_pForce_selectors = &force_selectors[0];
 
-				if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
-					cluster_optimizer_params.m_quality = cETCQualityUber;
+						if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+							cluster_optimizer_params.m_quality = cETCQualityUber;
 
-				etc1_optimizer::results cluster_optimizer_results;
+						etc1_optimizer::results cluster_optimizer_results;
 
-				std::vector<uint8_t> cluster_selectors(total_pixels);
-				cluster_optimizer_results.m_n = total_pixels;
-				cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
+						std::vector<uint8_t> cluster_selectors(total_pixels);
+						cluster_optimizer_results.m_n = total_pixels;
+						cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
 
-				optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
+						optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
 
-				optimizer.compute();
+						optimizer.compute();
 
-				new_endpoint_cluster_etc_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
-				new_endpoint_cluster_etc_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
-				new_endpoint_cluster_etc_params.m_color_error[0] = cluster_optimizer_results.m_error;
-				new_endpoint_cluster_etc_params.m_color_used[0] = true;
-				new_endpoint_cluster_etc_params.m_valid = true;
-			}
+						new_endpoint_cluster_etc_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
+						new_endpoint_cluster_etc_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
+						new_endpoint_cluster_etc_params.m_color_error[0] = cluster_optimizer_results.m_error;
+						new_endpoint_cluster_etc_params.m_color_used[0] = true;
+						new_endpoint_cluster_etc_params.m_valid = true;
+					}
 
-			if (new_endpoint_cluster_etc_params.m_color_error[0] < cur_err)
-			{
-				m_endpoint_cluster_etc_params[cluster_index] = new_endpoint_cluster_etc_params;
+					if (new_endpoint_cluster_etc_params.m_color_error[0] < cur_err)
+					{
+						m_endpoint_cluster_etc_params[cluster_index] = new_endpoint_cluster_etc_params;
 				
-				cluster_improved[cluster_index] = true;
-			}
+						cluster_improved[cluster_index] = true;
+					}
 
-			cluster_valid[cluster_index] = true;
+					cluster_valid[cluster_index] = true;
 
-		} // cluster_index
+				} // cluster_index
+			} );
+
+		} // cluster_index_iter
+
+		m_params.m_pJob_pool->wait_for_all();
 				
 		uint32_t total_unused_clusters = 0;
 		uint32_t total_improved_clusters = 0;
@@ -1980,7 +2311,8 @@
 
 		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
 		{
-#define CHECK(x) do { if (!(x)) return false; } while(0)
+//#define CHECK(x) do { if (!(x)) { DebugBreak(); return false; } } while(0)
+#define CHECK(x) BASISU_FRONTEND_VERIFY(x) // deliberately no trailing ';' - every CHECK(...) call site supplies its own, so the macro expands to exactly one statement
 
 			CHECK(get_output_block(block_index).get_flip_bit() == true);
 			
diff --git a/basisu_frontend.h b/basisu_frontend.h
index 2a630e0..4543677 100644
--- a/basisu_frontend.h
+++ b/basisu_frontend.h
@@ -17,6 +17,7 @@
 #include "basisu_etc.h"
 #include "basisu_gpu_texture.h"
 #include "basisu_global_selector_palette_helpers.h"
+#include "transcoder/basisu_file_headers.h"
 
 namespace basisu
 {
@@ -71,13 +72,17 @@
 				m_perceptual(true),
 				m_debug_stats(false),
 				m_debug_images(false),
-				m_dump_endpoint_clusterization(false),
+				m_dump_endpoint_clusterization(true),
 				m_pGlobal_sel_codebook(NULL),
 				m_num_global_sel_codebook_pal_bits(0),
 				m_num_global_sel_codebook_mod_bits(0),
 				m_use_hybrid_selector_codebooks(false),
 				m_hybrid_codebook_quality_thresh(0.0f),
-				m_validate(false)
+				m_validate(false),
+				m_tex_type(basist::cBASISTexType2D),
+				m_multithreaded(false),
+				m_disable_hierarchical_endpoint_codebooks(false),
+				m_pJob_pool(nullptr)
 			{
 			}
 
@@ -94,12 +99,17 @@
 			bool m_debug_images;
 			bool m_dump_endpoint_clusterization;
 			bool m_validate;
+			bool m_multithreaded;
+			bool m_disable_hierarchical_endpoint_codebooks;
 			
 			const basist::etc1_global_selector_codebook *m_pGlobal_sel_codebook;
 			uint32_t m_num_global_sel_codebook_pal_bits;
 			uint32_t m_num_global_sel_codebook_mod_bits;
 			bool m_use_hybrid_selector_codebooks;
 			float m_hybrid_codebook_quality_thresh;
+			basist::basis_texture_type m_tex_type;
+			
+			job_pool *m_pJob_pool;
 		};
 
 		bool init(const params &p);
@@ -313,11 +323,13 @@
 		// The sorted subblock endpoint quant error for each endpoint cluster
 		std::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;
 
+		std::mutex m_lock;
+
 		//-----------------------------------------------------------------------------
 
 		void init_etc1_images();
 		void init_endpoint_training_vectors();
-		void dump_endpoint_clusterization_visualization(const char *pFilename);
+		void dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors);
 		void generate_endpoint_clusters();
 		void compute_endpoint_subblock_error_vec();
 		void introduce_new_endpoint_clusters();
@@ -328,12 +340,13 @@
 		void compute_endpoint_clusters_within_each_parent_cluster();
 		void compute_selector_clusters_within_each_parent_cluster();
 		void create_initial_packed_texture();
-		void create_selector_clusters();
+		void generate_selector_clusters();
 		void create_optimized_selector_codebook(uint32_t iter);
 		void find_optimal_selector_clusters_for_each_block();
 		uint32_t refine_block_endpoints_given_selectors();
 		void finalize();
 		bool validate_output() const;
+		void introduce_special_selector_clusters();
 		void optimize_selector_codebook();
 		bool check_etc1s_constraints() const;
 	};
diff --git a/basisu_ssim.cpp b/basisu_ssim.cpp
index 2c92885..5ca0cbe 100644
--- a/basisu_ssim.cpp
+++ b/basisu_ssim.cpp
@@ -105,7 +105,7 @@
 
 		dst.crop(dst_width, dst_height);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int oy = 0; oy < dst_height; oy++)
 		{
 			for (int ox = 0; ox < dst_width; ox++)
@@ -139,7 +139,7 @@
 	{
 		dst.resize(src);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -158,7 +158,7 @@
 	{
 		dst.resize(src);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -173,7 +173,7 @@
 	{
 		dst.resize(src);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -194,7 +194,7 @@
 	{
 		dst.resize(src1);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -215,7 +215,7 @@
 	{
 		dst.resize(src1);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -232,7 +232,7 @@
 	{
 		dst.resize(src);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -248,7 +248,7 @@
 	{
 		dst.resize(src1);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
@@ -274,7 +274,7 @@
 	{
 		dst.resize(src1);
 
-#pragma omp parallel for
+//#pragma omp parallel for
 		for (int y = 0; y < (int)dst.get_height(); y++)
 		{
 			for (uint32_t x = 0; x < dst.get_width(); x++)
diff --git a/basisu_tool.cpp b/basisu_tool.cpp
index 602111f..68b582f 100644
--- a/basisu_tool.cpp
+++ b/basisu_tool.cpp
@@ -22,15 +22,12 @@
 #include "basisu_comp.h"
 #include "transcoder/basisu_transcoder.h"
 #include "basisu_ssim.h"
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
 
 #define BASISU_CATCH_EXCEPTIONS 1
 
 using namespace basisu;
 
-#define BASISU_TOOL_VERSION "1.07.00"
+#define BASISU_TOOL_VERSION "1.08.00"
 
 enum tool_mode
 {
@@ -73,6 +70,7 @@
 		" -stats: Compute and display image quality metrics (slightly slower).\n"
 		" -tex_type <2d, 2darray, 3d, video, cubemap>: Set Basis file header's texture type field. Cubemap arrays require multiples of 6 images, in X+, X-, Y+, Y-, Z+, Z- order, each image must be the same resolutions.\n"
 		"  2d=arbitrary 2D images, 2darray=2D array, 3D=volume texture slices, video=video frames, cubemap=array of faces. For 2darray/3d/cubemaps/video, each source image's dimensions and # of mipmap levels must be the same.\n"
+		" For video, the .basis file will be written with the first frame being an I-Frame, and subsequent frames being P-Frames (using conditional replenishment). Playback must always occur in order from first to last image.\n"
 		" -framerate X: Set framerate in header to X/frames sec.\n"
 		" -individual: Process input images individually and output multiple .basis files (not as a texture array)\n"
 		" -fuzz_testing: Use with -validate: Disables CRC16 validation of file contents before transcoding\n"
@@ -85,9 +83,10 @@
 		" -no_alpha: Always output non-alpha basis files, even if one or more inputs has alpha\n"
 		" -force_alpha: Always output alpha basis files, even if no inputs has alpha\n"
 		" -seperate_rg_to_color_alpha: Seperate input R and G channels to RGB and A (for tangent space XY normal maps)\n"
-		" -no_multithreading: Disable OpenMP multithreading\n"
+		" -no_multithreading: Disable multithreading\n"
 		" -no_ktx: Disable KTX writing when unpacking (faster)\n"
 		" -etc1_only: Only unpack to ETC1, skipping the other texture formats during -unpack\n"
+		" -disable_hierarchical_endpoint_codebooks: Disable hierarchical endpoint codebook usage, slower but higher quality on some compression levels\n"
 		"\n"
 		"Mipmap generation options:\n"
 		" -mipmap: Generate mipmaps for each source image\n"
@@ -130,6 +129,9 @@
 		" basisu -linear -global_sel_pal -file x.png: Compress a non-sRGB image, use hybrid selector codebooks for slightly improved compression (but slower encoding)\n"
 		" basisu -tex_type video -framerate 20 -multifile_printf \"x%02u.png\" -multifile_first 1 -multifile_count 20 : Compress a 20 sRGB source image video sequence (x01.png, x02.png, x03.png, etc.) to x01.basis\n"
 		"\n"
+		"Note: For video use, it's recommended you use a very powerful machine with many cores. Use -slower for better codebook generation, specify very large codebooks using -max_endpoints and -max_selectors, and reduce\n"
+		"the default endpoint RDO threshold (-endpoint_rdo_thresh) to around 1.25. Videos may have mipmaps and alpha channels. Videos must always be played back by the transcoder in first to last image order.\n"
+		"Video files currently use I-Frames on the first image, and P-Frames using conditional replenishment on subsequent frames.\n"
 		"Compression level details:\n"
 		" Level 0: Fastest, but has marginal quality and is a work in progress. Brittle on complex images. Avg. Y dB: 35.45\n"
 		" Level 1: Hierarchical codebook searching. 36.87 dB, ~1.4x slower vs. level 0. (This is the default setting.)\n"
@@ -345,9 +347,7 @@
 				m_comp_params.m_seperate_rg_to_color_alpha = true;
 			else if (strcasecmp(pArg, "-no_multithreading") == 0)
 			{
-#if defined(_OPENMP)
-				omp_set_num_threads(1);
-#endif				
+				m_comp_params.m_multithreading = false;
 			}
 			else if (strcasecmp(pArg, "-mipmap") == 0)
 				m_comp_params.m_mip_gen = true;
@@ -355,6 +355,8 @@
 				m_no_ktx = true;
 			else if (strcasecmp(pArg, "-etc1_only") == 0)
 				m_etc1_only = true;
+			else if (strcasecmp(pArg, "-disable_hierarchical_endpoint_codebooks") == 0)
+				m_comp_params.m_disable_hierarchical_endpoint_codebooks = true;
 			else if (strcasecmp(pArg, "-mip_scale") == 0)
 			{
 				REMAINING_ARGS_CHECK(1);
@@ -612,6 +614,18 @@
 static bool compress_mode(command_line_params &opts)
 {
 	basist::etc1_global_selector_codebook sel_codebook(basist::g_global_selector_cb_size, basist::g_global_selector_cb);
+
+	uint32_t num_threads = 1; // Stays at one thread unless multithreading is enabled below.
+
+	if (opts.m_comp_params.m_multithreading)
+	{
+		num_threads = std::thread::hardware_concurrency();
+		if (num_threads < 1) // hardware_concurrency() may report 0; fall back to a single thread.
+			num_threads = 1;
+	}
+
+	job_pool jpool(num_threads); // NOTE(review): jpool is a local whose address is stored in opts below; presumably it is only used while compress_mode() runs - confirm.
+	opts.m_comp_params.m_pJob_pool = &jpool;
 		
 	if (!expand_multifile(opts))
 	{
@@ -643,7 +657,7 @@
 	}
 
 	printf("Processing %u total files\n", (uint32_t)opts.m_input_filenames.size());
-	
+				
 	for (size_t file_index = 0; file_index < (opts.m_individual ? opts.m_input_filenames.size() : 1U); file_index++)
 	{
 		if (opts.m_individual)
@@ -895,6 +909,21 @@
 				ii.m_num_blocks_x, ii.m_num_blocks_y, ii.m_first_slice_index, (uint32_t)ii.m_alpha_flag);
 		}
 
+		printf("\nSlice info:\n"); // Print one summary line for every entry of fileinfo.m_slice_info.
+		for (uint32_t i = 0; i < fileinfo.m_slice_info.size(); i++)
+		{
+			const basist::basisu_slice_info &sliceinfo = fileinfo.m_slice_info[i];
+			printf("%u: OrigWidthHeight: %ux%u, BlockDim: %ux%u, TotalBlocks: %u, Compressed size: %u, Image: %u, Level: %u, UnpackedCRC16: 0x%X, alpha: %u, iframe: %u\n",
+				i,
+				sliceinfo.m_orig_width, sliceinfo.m_orig_height,
+				sliceinfo.m_num_blocks_x, sliceinfo.m_num_blocks_y,
+				sliceinfo.m_total_blocks,
+				sliceinfo.m_compressed_size,
+				sliceinfo.m_image_index, sliceinfo.m_level_index,
+				sliceinfo.m_unpacked_slice_crc16,
+				(uint32_t)sliceinfo.m_alpha_flag,
+				(uint32_t)sliceinfo.m_iframe_flag); // Cast to uint32_t, so it is printed with %u like the alpha flag.
+		}
 		printf("\n");
 
 		if (!dec.start_transcoding(&basis_data[0], (uint32_t)basis_data.size()))
@@ -925,6 +954,8 @@
 		}
 								
 		// Now transcode the file to all supported texture formats and save mipmapped KTX files
+		for (int format_iter = first_format; format_iter < last_format; format_iter++)
+		{
 		for (uint32_t image_index = 0; image_index < fileinfo.m_total_images; image_index++)
 		{
 			for (uint32_t level_index = 0; level_index < fileinfo.m_image_mipmap_levels[image_index]; level_index++)
@@ -937,8 +968,6 @@
 					return false;
 				}
 
-				for (int format_iter = first_format; format_iter < last_format; format_iter++)
-				{
 					const basist::transcoder_texture_format transcoder_tex_fmt = static_cast<basist::transcoder_texture_format>(format_iter);
 
 					if (transcoder_tex_fmt == basist::cTFPVRTC1_4_OPAQUE_ONLY)
@@ -1048,7 +1077,7 @@
 
 					if ((!opts.m_no_ktx) && (fileinfo.m_tex_type != basist::cBASISTexTypeCubemapArray))
 					{
-						std::string ktx_filename(base_filename + string_format("_transcoded_%s_%u.ktx", basist::basis_get_format_name(transcoder_tex_fmt), image_index));
+						std::string ktx_filename(base_filename + string_format("_transcoded_%s_%04u.ktx", basist::basis_get_format_name(transcoder_tex_fmt), image_index));
 						if (!write_compressed_texture_file(ktx_filename.c_str(), gi))
 						{
 							error_printf("Failed writing KTX file \"%s\"!\n", ktx_filename.c_str());
@@ -1075,7 +1104,11 @@
 						}
 						//u.crop(level_info.m_orig_width, level_info.m_orig_height);
 					
-						std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%u.png", basist::basis_get_format_name(transcoder_tex_fmt), image_index, level_index));
+						std::string rgb_filename;
+						if (gi.size() > 1)
+							rgb_filename = base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index);
+						else
+							rgb_filename = base_filename + string_format("_unpacked_rgb_%s_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), image_index);
 						if (!save_png(rgb_filename, u, cImageSaveIgnoreAlpha))
 						{
 							error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
@@ -1085,7 +1118,11 @@
 
 						if (basis_transcoder_format_has_alpha(transcoder_tex_fmt))
 						{
-							std::string a_filename(base_filename + string_format("_unpacked_a_%s_%u_%u.png", basist::basis_get_format_name(transcoder_tex_fmt), image_index, level_index));
+							std::string a_filename;
+							if (gi.size() > 1)
+								a_filename = base_filename + string_format("_unpacked_a_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index);
+							else
+								a_filename = base_filename + string_format("_unpacked_a_%s_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), image_index);
 							if (!save_png(a_filename, u, cImageSaveGrayscale, 3))
 							{
 								error_printf("Failed writing to PNG file \"%s\"\n", a_filename.c_str());
@@ -1190,8 +1227,7 @@
 
 	const int X = 2;
 
-#pragma omp parallel for
-	for (int y = 0; y < (int)a.get_height(); y++)
+	for (uint32_t y = 0; y < a.get_height(); y++)
 	{
 		for (uint32_t x = 0; x < a.get_width(); x++)
 		{
diff --git a/transcoder/basisu_file_headers.h b/transcoder/basisu_file_headers.h
index 124b7f8..4b3bcf3 100644
--- a/transcoder/basisu_file_headers.h
+++ b/transcoder/basisu_file_headers.h
@@ -21,6 +21,7 @@
 	enum basis_slice_desc_flags
 	{
 		cSliceDescFlagsIsAlphaData = 1,
+		cSliceDescFlagsFrameIsIFrame = 2			// Video only: Frame doesn't refer to previous frame (no usage of conditional replenishment pred symbols)
 	};
 
 #pragma pack(push)
diff --git a/transcoder/basisu_transcoder.cpp b/transcoder/basisu_transcoder.cpp
index 40896dc..a2ba411 100644
--- a/transcoder/basisu_transcoder.cpp
+++ b/transcoder/basisu_transcoder.cpp
@@ -47,16 +47,20 @@
 #define BASISD_WRITE_NEW_DXT1_TABLES			0
 #define BASISD_WRITE_NEW_ETC2_EAC_A8_TABLES	0
 
+#ifndef BASISD_ENABLE_DEBUG_FLAGS
+#define BASISD_ENABLE_DEBUG_FLAGS	0
+#endif
+
 namespace basisu
 {
-	static bool g_debug_printf;
+	bool g_debug_printf;
 
 	void enable_debug_printf(bool enabled)
 	{
 		g_debug_printf = enabled;
 	}
 
-	void debug_printf(const char *pFmt, ...)
+	void debug_printf(const char* pFmt, ...)
 	{
 #if BASISU_DEVEL_MESSAGES	
 		g_debug_printf = true;
@@ -73,14 +77,34 @@
 
 namespace basist
 {
-	#include "basisu_transcoder_tables_bc7_m6.inc"
-				
-	uint16_t crc16(const void *r, size_t size, uint16_t crc)
+#include "basisu_transcoder_tables_bc7_m6.inc"
+
+#if BASISD_ENABLE_DEBUG_FLAGS
+	static uint32_t g_debug_flags = 0;
+#endif
+
+	uint32_t get_debug_flags() // Returns the current debug flag bitmask; always 0 when BASISD_ENABLE_DEBUG_FLAGS is 0.
+	{
+#if BASISD_ENABLE_DEBUG_FLAGS
+		return g_debug_flags;
+#else
+		return 0; // Debug flags are compiled out in this configuration.
+#endif
+	}
+
+	void set_debug_flags(uint32_t f) // Stores f as the debug flag bitmask; does nothing when BASISD_ENABLE_DEBUG_FLAGS is 0.
+	{
+		(void)f; // Keeps f referenced when the assignment below is compiled out.
+#if BASISD_ENABLE_DEBUG_FLAGS
+		g_debug_flags = f;
+#endif
+	}
+	uint16_t crc16(const void* r, size_t size, uint16_t crc)
 	{
 		crc = ~crc;
 
-		const uint8_t *p = reinterpret_cast<const uint8_t *>(r);
-		for ( ; size; --size)
+		const uint8_t* p = reinterpret_cast<const uint8_t*>(r);
+		for (; size; --size)
 		{
 			const uint16_t q = *p++ ^ (crc >> 8);
 			uint16_t k = (q >> 4) ^ q;
@@ -89,21 +113,21 @@
 
 		return static_cast<uint16_t>(~crc);
 	}
-		
+
 	const uint32_t g_global_selector_cb[] =
 #include "basisu_global_selector_cb.h"
 		;
 
 	const uint32_t g_global_selector_cb_size = sizeof(g_global_selector_cb) / sizeof(g_global_selector_cb[0]);
 
-	void etc1_global_selector_codebook::init(uint32_t N, const uint32_t *pEntries)
+	void etc1_global_selector_codebook::init(uint32_t N, const uint32_t* pEntries)
 	{
 		m_palette.resize(N);
 		for (uint32_t i = 0; i < N; i++)
 			m_palette[i].set_uint32(pEntries[i]);
 	}
 
-	void etc1_global_selector_codebook::print_code(FILE *pFile)
+	void etc1_global_selector_codebook::print_code(FILE* pFile)
 	{
 		fprintf(pFile, "{\n");
 		for (uint32_t i = 0; i < m_palette.size(); i++)
@@ -266,7 +290,7 @@
 			assert((x | y | val) < 4);
 			const uint32_t bit_index = x * 4 + y;
 
-			uint8_t * p = &m_bytes[7 - (bit_index >> 3)];
+			uint8_t* p = &m_bytes[7 - (bit_index >> 3)];
 
 			const uint32_t byte_bit_ofs = bit_index & 7;
 			const uint32_t mask = 1 << byte_bit_ofs;
@@ -291,7 +315,7 @@
 
 			const uint32_t bit_index = x * 4 + y;
 			const uint32_t byte_bit_ofs = bit_index & 7;
-			const uint8_t * p = &m_bytes[7 - (bit_index >> 3)];
+			const uint8_t* p = &m_bytes[7 - (bit_index >> 3)];
 			const uint32_t lsb = (p[0] >> byte_bit_ofs) & 1;
 			const uint32_t msb = (p[-2] >> byte_bit_ofs) & 1;
 			const uint32_t val = lsb | (msb << 1);
@@ -367,7 +391,7 @@
 			set_byte_bits(cETC1DeltaColor3BBitOffset, 3, c & 7);
 		}
 
-		void set_block_color4(const color32 & c0_unscaled, const color32 & c1_unscaled)
+		void set_block_color4(const color32& c0_unscaled, const color32& c1_unscaled)
 		{
 			set_diff_bit(false);
 
@@ -375,7 +399,7 @@
 			set_base4_color(1, pack_color4(c1_unscaled, false));
 		}
 
-		void set_block_color5(const color32 & c0_unscaled, const color32 & c1_unscaled)
+		void set_block_color5(const color32& c0_unscaled, const color32& c1_unscaled)
 		{
 			set_diff_bit(true);
 
@@ -388,7 +412,7 @@
 			set_delta3_color(pack_delta3(dr, dg, db));
 		}
 
-		bool set_block_color5_check(const color32 & c0_unscaled, const color32 & c1_unscaled)
+		bool set_block_color5_check(const color32& c0_unscaled, const color32& c1_unscaled)
 		{
 			set_diff_bit(true);
 
@@ -438,7 +462,7 @@
 			return (m_bytes[3] >> ofs) & 7;
 		}
 
-		static uint16_t pack_color4(const color32 & color, bool scaled, uint32_t bias = 127U)
+		static uint16_t pack_color4(const color32& color, bool scaled, uint32_t bias = 127U)
 		{
 			return pack_color4(color.r, color.g, color.b, scaled, bias);
 		}
@@ -459,7 +483,7 @@
 			return static_cast<uint16_t>(b | (g << 4U) | (r << 8U));
 		}
 
-		static uint16_t pack_color5(const color32 & color, bool scaled, uint32_t bias = 127U)
+		static uint16_t pack_color5(const color32& color, bool scaled, uint32_t bias = 127U)
 		{
 			return pack_color5(color.r, color.g, color.b, scaled, bias);
 		}
@@ -480,7 +504,7 @@
 			return static_cast<uint16_t>(b | (g << 5U) | (r << 10U));
 		}
 
-		uint16_t pack_delta3(const color32 & color)
+		uint16_t pack_delta3(const color32& color)
 		{
 			return pack_delta3(color.r, color.g, color.b);
 		}
@@ -512,7 +536,7 @@
 			return color32(r, g, b, alpha);
 		}
 
-		static void unpack_color5(uint32_t & r, uint32_t & g, uint32_t & b, uint16_t packed_color5, bool scaled)
+		static void unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, bool scaled)
 		{
 			color32 c(unpack_color5(packed_color5, scaled, 0));
 			r = c.r;
@@ -520,7 +544,7 @@
 			b = c.b;
 		}
 
-		static void get_diff_subblock_colors(color32 * pDst, uint16_t packed_color5, uint32_t table_idx)
+		static void get_diff_subblock_colors(color32* pDst, uint16_t packed_color5, uint32_t table_idx)
 		{
 			assert(table_idx < cETC1IntenModifierValues);
 			const int* pInten_modifer_table = &g_etc1_inten_tables[table_idx][0];
@@ -556,7 +580,7 @@
 			return x;
 		}
 
-		static void get_block_colors5(color32 * pBlock_colors, const color32 & base_color5, uint32_t inten_table)
+		static void get_block_colors5(color32* pBlock_colors, const color32& base_color5, uint32_t inten_table)
 		{
 			color32 b(base_color5);
 
@@ -571,8 +595,8 @@
 			pBlock_colors[2].set(clamp255(b.r + pInten_table[2]), clamp255(b.g + pInten_table[2]), clamp255(b.b + pInten_table[2]), 255);
 			pBlock_colors[3].set(clamp255(b.r + pInten_table[3]), clamp255(b.g + pInten_table[3]), clamp255(b.b + pInten_table[3]), 255);
 		}
-				
-		static void get_block_colors5_bounds(color32 * pBlock_colors, const color32 & base_color5, uint32_t inten_table, uint32_t l = 0, uint32_t h = 3)
+
+		static void get_block_colors5_bounds(color32* pBlock_colors, const color32& base_color5, uint32_t inten_table, uint32_t l = 0, uint32_t h = 3)
 		{
 			color32 b(base_color5);
 
@@ -589,8 +613,8 @@
 
 	enum dxt_constants
 	{
-		cDXT1SelectorBits = 2U,	cDXT1SelectorValues = 1U << cDXT1SelectorBits, cDXT1SelectorMask = cDXT1SelectorValues - 1U,
-		cDXT5SelectorBits = 3U,	cDXT5SelectorValues = 1U << cDXT5SelectorBits, cDXT5SelectorMask = cDXT5SelectorValues - 1U,
+		cDXT1SelectorBits = 2U, cDXT1SelectorValues = 1U << cDXT1SelectorBits, cDXT1SelectorMask = cDXT1SelectorValues - 1U,
+		cDXT5SelectorBits = 3U, cDXT5SelectorValues = 1U << cDXT5SelectorBits, cDXT5SelectorMask = cDXT5SelectorValues - 1U,
 	};
 
 	static const uint8_t g_etc1_x_selector_unpack[4][256] =
@@ -649,14 +673,14 @@
 
 		inline void clear() { basisu::clear_obj(*this); }
 
-		inline uint32_t get_high_color() const	{ return m_high_color[0] | (m_high_color[1] << 8U); }
+		inline uint32_t get_high_color() const { return m_high_color[0] | (m_high_color[1] << 8U); }
 		inline uint32_t get_low_color() const { return m_low_color[0] | (m_low_color[1] << 8U); }
 		inline void set_low_color(uint16_t c) { m_low_color[0] = static_cast<uint8_t>(c & 0xFF); m_low_color[1] = static_cast<uint8_t>((c >> 8) & 0xFF); }
 		inline void set_high_color(uint16_t c) { m_high_color[0] = static_cast<uint8_t>(c & 0xFF); m_high_color[1] = static_cast<uint8_t>((c >> 8) & 0xFF); }
 		inline uint32_t get_selector(uint32_t x, uint32_t y) const { assert((x < 4U) && (y < 4U)); return (m_selectors[y] >> (x * cDXT1SelectorBits)) & cDXT1SelectorMask; }
 		inline void set_selector(uint32_t x, uint32_t y, uint32_t val) { assert((x < 4U) && (y < 4U) && (val < 4U)); m_selectors[y] &= (~(cDXT1SelectorMask << (x * cDXT1SelectorBits))); m_selectors[y] |= (val << (x * cDXT1SelectorBits)); }
 
-		static uint16_t pack_color(const color32 &color, bool scaled, uint32_t bias = 127U)
+		static uint16_t pack_color(const color32& color, bool scaled, uint32_t bias = 127U)
 		{
 			uint32_t r = color.r, g = color.g, b = color.b;
 			if (scaled)
@@ -928,8 +952,8 @@
 	};
 	static bc1_match_entry g_bc1_match5_equals_1[256], g_bc1_match6_equals_1[256]; // selector 1, allow equals hi/lo
 	static bc1_match_entry g_bc1_match5_equals_0[256], g_bc1_match6_equals_0[256]; // selector 0, allow equals hi/lo
-	
-	static void prepare_bc1_single_color_table(bc1_match_entry *pTable, const uint8_t *pExpand, int size, int sel)
+
+	static void prepare_bc1_single_color_table(bc1_match_entry* pTable, const uint8_t* pExpand, int size, int sel)
 	{
 		int total_e = 0;
 
@@ -942,7 +966,7 @@
 				{
 					const int lo_e = pExpand[lo], hi_e = pExpand[hi];
 					int e;
-										
+
 					if (sel == 1)
 					{
 						// Selector 1
@@ -960,7 +984,7 @@
 					{
 						pTable[i].m_hi = static_cast<uint8_t>(hi);
 						pTable[i].m_lo = static_cast<uint8_t>(lo);
-						
+
 						lowest_e = e;
 					}
 
@@ -972,7 +996,7 @@
 	}
 #endif // BASISD_SUPPORT_DXT1
 
-	#if BASISD_WRITE_NEW_DXT1_TABLES
+#if BASISD_WRITE_NEW_DXT1_TABLES
 	static void create_etc1_to_dxt1_5_conversion_table()
 	{
 		FILE* pFile = nullptr;
@@ -1224,7 +1248,7 @@
 		std::vector<uint8_t> m_selectors_temp;
 	};
 
-	static uint64_t pack_eac_a8_exhaustive(pack_eac_a8_results & results, const uint8_t * pPixels, uint32_t num_pixels)
+	static uint64_t pack_eac_a8_exhaustive(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels)
 	{
 		results.m_selectors.resize(num_pixels);
 		results.m_selectors_temp.resize(num_pixels);
@@ -1601,7 +1625,7 @@
 					pack_eac_a8_results pack_results;
 					pack_eac_a8_exhaustive(pack_results, pixels, num_pixels);
 
-					etc1_g_to_etc2_a8_conversion & c = s_etc1_g_to_etc2_a8[base + inten * 32][sel_range];
+					etc1_g_to_etc2_a8_conversion& c = s_etc1_g_to_etc2_a8[base + inten * 32][sel_range];
 
 					c.m_base = pack_results.m_base;
 					c.m_table_mul = pack_results.m_table * 16 + pack_results.m_multiplier;
@@ -1658,13 +1682,13 @@
 			bc1_expand5[i] = static_cast<uint8_t>((i << 3) | (i >> 2));
 		prepare_bc1_single_color_table(g_bc1_match5_equals_1, bc1_expand5, 32, 1);
 		prepare_bc1_single_color_table(g_bc1_match5_equals_0, bc1_expand5, 32, 0);
-			
+
 		uint8_t bc1_expand6[64];
 		for (int i = 0; i < 64; i++)
 			bc1_expand6[i] = static_cast<uint8_t>((i << 2) | (i >> 4));
 		prepare_bc1_single_color_table(g_bc1_match6_equals_1, bc1_expand6, 64, 1);
 		prepare_bc1_single_color_table(g_bc1_match6_equals_0, bc1_expand6, 64, 0);
-						
+
 		for (uint32_t i = 0; i < NUM_ETC1_TO_DXT1_SELECTOR_RANGES; i++)
 		{
 			uint32_t l = g_etc1_to_dxt1_selector_ranges[i].m_low;
@@ -1724,7 +1748,7 @@
 	}
 
 #if BASISD_SUPPORT_DXT1
-	static void convert_etc1s_to_dxt1(dxt1_block * pDst_block, const decoder_etc_block *pSrc_block, const selector * pSelector, bool use_threecolor_blocks)
+	static void convert_etc1s_to_dxt1(dxt1_block* pDst_block, const decoder_etc_block* pSrc_block, const selector* pSelector, bool use_threecolor_blocks)
 	{
 #if !BASISD_WRITE_NEW_DXT1_TABLES
 		const uint32_t low_selector = pSelector->m_lo_selector;
@@ -1756,7 +1780,7 @@
 				// Make l > h
 				if (min16 > 0)
 					min16--;
-				else 
+				else
 				{
 					// l = h = 0
 					assert(min16 == max16 && max16 == 0);
@@ -1765,7 +1789,7 @@
 					min16 = 0;
 					mask = 0x55;
 				}
-			
+
 				assert(max16 > min16);
 			}
 
@@ -1774,7 +1798,7 @@
 				std::swap(max16, min16);
 				mask ^= 0x55;
 			}
-						
+
 			pDst_block->set_low_color(static_cast<uint16_t>(max16));
 			pDst_block->set_high_color(static_cast<uint16_t>(min16));
 			pDst_block->m_selectors[0] = static_cast<uint8_t>(mask);
@@ -1810,28 +1834,28 @@
 				{
 					min16--;
 
-					l = 0; 
+					l = 0;
 					h = 0;
 				}
-				else 
+				else
 				{
 					// l = h = 0
 					assert(min16 == max16 && max16 == 0);
 
 					max16 = 1;
 					min16 = 0;
-					
+
 					l = 1;
 					h = 1;
 				}
-			
+
 				assert(max16 > min16);
 			}
 
 			if (max16 < min16)
 			{
 				std::swap(max16, min16);
-				l = 1; 
+				l = 1;
 				h = 0;
 			}
 
@@ -1854,9 +1878,9 @@
 		const uint32_t selector_range_table = g_etc1_to_dxt1_selector_range_index[low_selector][high_selector];
 
 		//[32][8][RANGES][MAPPING]
-		const etc1_to_dxt1_56_solution *pTable_r = &g_etc1_to_dxt_5[(inten_table * 32 + base_color.r) * (NUM_ETC1_TO_DXT1_SELECTOR_RANGES * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS) + selector_range_table * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS];
-		const etc1_to_dxt1_56_solution *pTable_g = &g_etc1_to_dxt_6[(inten_table * 32 + base_color.g) * (NUM_ETC1_TO_DXT1_SELECTOR_RANGES * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS) + selector_range_table * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS];
-		const etc1_to_dxt1_56_solution *pTable_b = &g_etc1_to_dxt_5[(inten_table * 32 + base_color.b) * (NUM_ETC1_TO_DXT1_SELECTOR_RANGES * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS) + selector_range_table * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS];
+		const etc1_to_dxt1_56_solution* pTable_r = &g_etc1_to_dxt_5[(inten_table * 32 + base_color.r) * (NUM_ETC1_TO_DXT1_SELECTOR_RANGES * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS) + selector_range_table * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS];
+		const etc1_to_dxt1_56_solution* pTable_g = &g_etc1_to_dxt_6[(inten_table * 32 + base_color.g) * (NUM_ETC1_TO_DXT1_SELECTOR_RANGES * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS) + selector_range_table * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS];
+		const etc1_to_dxt1_56_solution* pTable_b = &g_etc1_to_dxt_5[(inten_table * 32 + base_color.b) * (NUM_ETC1_TO_DXT1_SELECTOR_RANGES * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS) + selector_range_table * NUM_ETC1_TO_DXT1_SELECTOR_MAPPINGS];
 
 		uint32_t best_err = UINT_MAX;
 		uint32_t best_mapping = 0;
@@ -1870,7 +1894,7 @@
 		uint32_t l = dxt1_block::pack_unscaled_color(pTable_r[best_mapping].m_lo, pTable_g[best_mapping].m_lo, pTable_b[best_mapping].m_lo);
 		uint32_t h = dxt1_block::pack_unscaled_color(pTable_r[best_mapping].m_hi, pTable_g[best_mapping].m_hi, pTable_b[best_mapping].m_hi);
 
-		const uint8_t *pSelectors_xlat = &g_etc1_to_dxt1_selector_mappings1[best_mapping][0];
+		const uint8_t* pSelectors_xlat = &g_etc1_to_dxt1_selector_mappings1[best_mapping][0];
 
 		if (l < h)
 		{
@@ -1892,7 +1916,7 @@
 				// Make l > h
 				if (h > 0)
 					h--;
-				else 
+				else
 				{
 					// l = h = 0
 					assert(l == h && h == 0);
@@ -1906,7 +1930,7 @@
 				pDst_block->set_low_color(static_cast<uint16_t>(l));
 				pDst_block->set_high_color(static_cast<uint16_t>(h));
 			}
-			
+
 			pDst_block->m_selectors[0] = mask;
 			pDst_block->m_selectors[1] = mask;
 			pDst_block->m_selectors[2] = mask;
@@ -1980,6 +2004,25 @@
 		pDst_block->m_selectors[3] = (uint8_t)dxt1_sels3;
 #endif
 	}
+#if BASISD_ENABLE_DEBUG_FLAGS
+	static void convert_etc1s_to_dxt1_vis(dxt1_block* pDst_block, const decoder_etc_block* pSrc_block, const selector* pSelector, bool use_threecolor_blocks) // Debug variant: runs convert_etc1s_to_dxt1(), then overrides part of the result according to g_debug_flags.
+	{
+		convert_etc1s_to_dxt1(pDst_block, pSrc_block, pSelector, use_threecolor_blocks);
+		if (g_debug_flags & cDebugFlagVisBC1Sels) // Overwrite both endpoint colors with fixed values; the converted selectors are left as-is.
+		{
+			uint32_t l = dxt1_block::pack_unscaled_color(31, 63, 31); // Largest 5/6/5-bit component values (presumably white - confirm against pack_unscaled_color).
+			uint32_t h = dxt1_block::pack_unscaled_color(0, 0, 0); // All components zero.
+			pDst_block->set_low_color(static_cast<uint16_t>(l));
+			pDst_block->set_high_color(static_cast<uint16_t>(h));
+		}
+		else if (g_debug_flags & cDebugFlagVisBC1Endpoints) // Only reached when cDebugFlagVisBC1Sels is clear: overwrite the selectors; the converted endpoints are left as-is.
+		{
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					pDst_block->set_selector(x, y, (y < 2) ? 0 : 1); // Rows 0-1 get selector 0, rows 2-3 get selector 1.
+		}
+	}
+#endif
 #endif
 
 	static dxt_selector_range s_dxt5a_selector_ranges[] =
@@ -2354,7 +2397,7 @@
 			return 6;
 		}
 
-		static uint32_t get_block_values8(color32 * pDst, uint32_t l, uint32_t h)
+		static uint32_t get_block_values8(color32* pDst, uint32_t l, uint32_t h)
 		{
 			pDst[0].a = static_cast<uint8_t>(l);
 			pDst[1].a = static_cast<uint8_t>(h);
@@ -2367,7 +2410,7 @@
 			return 8;
 		}
 
-		static uint32_t get_block_values(color32 * pDst, uint32_t l, uint32_t h)
+		static uint32_t get_block_values(color32* pDst, uint32_t l, uint32_t h)
 		{
 			if (l > h)
 				return get_block_values8(pDst, l, h);
@@ -2376,7 +2419,7 @@
 		}
 	};
 
-	static void convert_etc1s_to_dxt5a(dxt5a_block *pDst_block, const decoder_etc_block *pSrc_block, const selector *pSelector)
+	static void convert_etc1s_to_dxt5a(dxt5a_block* pDst_block, const decoder_etc_block* pSrc_block, const selector* pSelector)
 	{
 		const uint32_t low_selector = pSelector->m_lo_selector;
 		const uint32_t high_selector = pSelector->m_hi_selector;
@@ -2434,7 +2477,7 @@
 		if (selector_range_table >= NUM_DXT5A_SELECTOR_RANGES)
 			selector_range_table = 0;
 
-		const etc1_g_to_dxt5a_conversion *pTable_entry = &g_etc1_g_to_dxt5a[base_color.r + inten_table * 32][selector_range_table];
+		const etc1_g_to_dxt5a_conversion* pTable_entry = &g_etc1_g_to_dxt5a[base_color.r + inten_table * 32][selector_range_table];
 
 		pDst_block->set_low_alpha(pTable_entry->m_lo);
 		pDst_block->set_high_alpha(pTable_entry->m_hi);
@@ -2649,7 +2692,7 @@
 		}
 
 		// accepts 5554 or 8888
-		inline void set_endpoint(uint32_t endpoint_index, const color32 & c, bool opaque_endpoint, bool pack = false, uint32_t pack_round = 128)
+		inline void set_endpoint(uint32_t endpoint_index, const color32& c, bool opaque_endpoint, bool pack = false, uint32_t pack_round = 128)
 		{
 			assert(endpoint_index < 2);
 			const uint32_t m = m_endpoints & 1;
@@ -2817,7 +2860,7 @@
 	}
 
 	// TODO: Support decoding a non-pow2 ETC1S texture into the next larger pow2 PVRTC texture.
-	static void fixup_pvrtc1_4_modulation(const decoder_etc_block *pETC_Blocks, const uint32_t *pPVRTC_endpoints, void *pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool pvrtc_wrap_addressing)
+	static void fixup_pvrtc1_4_modulation(const decoder_etc_block* pETC_Blocks, const uint32_t* pPVRTC_endpoints, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool pvrtc_wrap_addressing)
 	{
 		const uint32_t x_mask = num_blocks_x - 1;
 		const uint32_t y_mask = num_blocks_y - 1;
@@ -2874,15 +2917,14 @@
 						swizzled |= ((y >> min_bits) << (min_bits * 2));
 				}
 
-
-				pvrtc4_block *pDst_block = static_cast<pvrtc4_block*>(pDst_blocks) + swizzled;
+				pvrtc4_block* pDst_block = static_cast<pvrtc4_block*>(pDst_blocks) + swizzled;
 				pDst_block->m_endpoints = pPVRTC_endpoints[block_index];
 
 				uint32_t base_r = g_etc_5_to_8[src_block.m_differential.m_red1];
 				uint32_t base_g = g_etc_5_to_8[src_block.m_differential.m_green1];
 				uint32_t base_b = g_etc_5_to_8[src_block.m_differential.m_blue1];
 
-				const int *pInten_table48 = g_etc1_inten_tables48[src_block.m_differential.m_cw1];
+				const int* pInten_table48 = g_etc1_inten_tables48[src_block.m_differential.m_cw1];
 				int by = (base_r + base_g + base_b) * 16;
 				int block_colors_y_x16[4];
 				block_colors_y_x16[0] = by + pInten_table48[2];
@@ -3045,12 +3087,12 @@
 		};
 	};
 
-	static void convert_etc1s_to_bc7_m6(bc7_mode_6 * pDst_block, const decoder_etc_block * pSrc_block, const selector * pSelector)
+	static void convert_etc1s_to_bc7_m6(bc7_mode_6* pDst_block, const decoder_etc_block* pSrc_block, const selector* pSelector)
 	{
 #if !BASISD_WRITE_NEW_BC7_TABLES
 		const uint32_t low_selector = pSelector->m_lo_selector;
 		const uint32_t high_selector = pSelector->m_hi_selector;
-		
+
 		const uint32_t inten_table = pSrc_block->m_differential.m_cw1;
 		const uint32_t base_color_r = pSrc_block->m_differential.m_red1;
 		const uint32_t base_color_g = pSrc_block->m_differential.m_green1;
@@ -3079,7 +3121,7 @@
 			const uint32_t b1 = block_colors[high_selector].b;
 			const uint32_t low_bits1 = (r1 & 1) + (g1 & 1) + (b1 & 1);
 			uint32_t p1 = low_bits1 >= 2;
-															
+
 			pDst_block->m_lo.m_r0 = r0 >> 1;
 			pDst_block->m_lo.m_g0 = g0 >> 1;
 			pDst_block->m_lo.m_b0 = b0 >> 1;
@@ -3088,7 +3130,7 @@
 			pDst_block->m_lo.m_r1 = r1 >> 1;
 			pDst_block->m_lo.m_g1 = g1 >> 1;
 			pDst_block->m_lo.m_b1 = b1 >> 1;
-						
+
 			uint32_t output_low_selector = 0;
 			uint32_t output_bit_offset = 1;
 			uint64_t output_hi_bits = p1;
@@ -3099,7 +3141,7 @@
 				{
 					uint32_t s = pSrc_block->get_selector(x, y);
 					uint32_t os = (s == low_selector) ? output_low_selector : (15 ^ output_low_selector);
-					
+
 					uint32_t num_bits = 4;
 
 					if ((x | y) == 0)
@@ -3118,7 +3160,7 @@
 							output_hi_bits &= ~1ULL;
 							output_hi_bits |= p0;
 							std::swap(p0, p1);
-												
+
 							output_low_selector = 15;
 							os = 0;
 						}
@@ -3132,12 +3174,12 @@
 			}
 
 			pDst_block->m_hi_bits = output_hi_bits;
-			
+
 			assert(pDst_block->m_hi.m_p1 == p1);
-									
+
 			return;
 		}
-				
+
 		uint32_t selector_range_table = g_etc1_to_bc7_m6_selector_range_index[low_selector][high_selector];
 
 		const uint32_t* pTable_r = g_etc1_to_bc7_m6_table[base_color_r + inten_table * 32] + (selector_range_table * NUM_ETC1_TO_BC7_M6_SELECTOR_MAPPINGS);
@@ -3253,7 +3295,7 @@
 #endif
 
 #if BASISD_SUPPORT_ETC2_EAC_A8
-	static void convert_etc1s_to_etc2_eac_a8(eac_a8_block * pDst_block, const decoder_etc_block * pSrc_block, const selector * pSelector)
+	static void convert_etc1s_to_etc2_eac_a8(eac_a8_block* pDst_block, const decoder_etc_block* pSrc_block, const selector* pSelector)
 	{
 		const uint32_t low_selector = pSelector->m_lo_selector;
 		const uint32_t high_selector = pSelector->m_hi_selector;
@@ -3289,7 +3331,7 @@
 		if (selector_range_table >= NUM_ETC2_EAC_A8_SELECTOR_RANGES)
 			selector_range_table = 0;
 
-		const etc1_g_to_etc2_a8_conversion *pTable_entry = &s_etc1_g_to_etc2_a8[base_color.r + inten_table * 32][selector_range_table];
+		const etc1_g_to_etc2_a8_conversion* pTable_entry = &s_etc1_g_to_etc2_a8[base_color.r + inten_table * 32][selector_range_table];
 
 		pDst_block->m_base = pTable_entry->m_base;
 		pDst_block->m_table = pTable_entry->m_table_mul >> 4;
@@ -3315,15 +3357,15 @@
 	}
 #endif // BASISD_SUPPORT_ETC2_EAC_A8
 
-	basisu_lowlevel_transcoder::basisu_lowlevel_transcoder(const etc1_global_selector_codebook * pGlobal_sel_codebook) :
+	basisu_lowlevel_transcoder::basisu_lowlevel_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook) :
 		m_pGlobal_sel_codebook(pGlobal_sel_codebook),
 		m_selector_history_buf_size(0)
 	{
 	}
 
 	bool basisu_lowlevel_transcoder::decode_palettes(
-		uint32_t num_endpoints, const uint8_t * pEndpoints_data, uint32_t endpoints_data_size,
-		uint32_t num_selectors, const uint8_t * pSelectors_data, uint32_t selectors_data_size)
+		uint32_t num_endpoints, const uint8_t* pEndpoints_data, uint32_t endpoints_data_size,
+		uint32_t num_selectors, const uint8_t* pSelectors_data, uint32_t selectors_data_size)
 	{
 		bitwise_decoder sym_codec;
 
@@ -3331,53 +3373,53 @@
 
 		if (!sym_codec.init(pEndpoints_data, endpoints_data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 0\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 0\n");
 			return false;
 		}
-				
+
 		if (!sym_codec.read_huffman_table(color5_delta_model0))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 1\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 1\n");
 			return false;
 		}
 
 		if (!sym_codec.read_huffman_table(color5_delta_model1))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 1a\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 1a\n");
 			return false;
 		}
 
 		if (!sym_codec.read_huffman_table(color5_delta_model2))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2a\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2a\n");
 			return false;
 		}
-				
+
 		if (!sym_codec.read_huffman_table(inten_delta_model))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2b\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2b\n");
 			return false;
 		}
 
 		if (!color5_delta_model0.is_valid() || !color5_delta_model1.is_valid() || !color5_delta_model2.is_valid() || !inten_delta_model.is_valid())
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2b\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 2b\n");
 			return false;
 		}
 
 		const bool endpoints_are_grayscale = sym_codec.get_bits(1) != 0;
-		
+
 		m_endpoints.resize(num_endpoints);
 
 		color32 prev_color5(16, 16, 16, 0);
 		uint32_t prev_inten = 0;
-		
+
 		for (uint32_t i = 0; i < num_endpoints; i++)
 		{
 			uint32_t inten_delta = sym_codec.decode_huffman(inten_delta_model);
 			m_endpoints[i].m_inten5 = static_cast<uint8_t>((inten_delta + prev_inten) & 7);
 			prev_inten = m_endpoints[i].m_inten5;
-			
+
 			for (uint32_t c = 0; c < (endpoints_are_grayscale ? 1U : 3U); c++)
 			{
 				int delta;
@@ -3389,7 +3431,7 @@
 					delta = sym_codec.decode_huffman(color5_delta_model2);
 
 				int v = (prev_color5[c] + delta) & 31;
-				
+
 				m_endpoints[i].m_color5[c] = static_cast<uint8_t>(v);
 
 				prev_color5[c] = static_cast<uint8_t>(v);
@@ -3401,14 +3443,14 @@
 				m_endpoints[i].m_color5[2] = m_endpoints[i].m_color5[0];
 			}
 		}
-				
+
 		sym_codec.stop();
 
 		m_selectors.resize(num_selectors);
 
 		if (!sym_codec.init(pSelectors_data, selectors_data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 5\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 5\n");
 			return false;
 		}
 
@@ -3427,16 +3469,16 @@
 			{
 				if (!sym_codec.read_huffman_table(mod_model))
 				{
-					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 6\n");		
+					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 6\n");
 					return false;
 				}
 				if (!mod_model.is_valid())
 				{
-					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 6a\n");		
+					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 6a\n");
 					return false;
 				}
 			}
-						
+
 			for (uint32_t i = 0; i < num_selectors; i++)
 			{
 				uint32_t pal_index = 0;
@@ -3446,10 +3488,10 @@
 				uint32_t mod_index = 0;
 				if (mod_bits)
 					mod_index = sym_codec.decode_huffman(mod_model);
-					
+
 				if (pal_index >= m_pGlobal_sel_codebook->size())
 				{
-					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7z\n");		
+					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7z\n");
 					return false;
 				}
 
@@ -3459,7 +3501,7 @@
 				for (uint32_t y = 0; y < 4; y++)
 					for (uint32_t x = 0; x < 4; x++)
 						m_selectors[i].set_selector(x, y, e[x + y * 4]);
-								
+
 				m_selectors[i].init_flags();
 			}
 		}
@@ -3475,12 +3517,12 @@
 				basist::huffman_decoding_table uses_global_cb_bitflags_model;
 				if (!sym_codec.read_huffman_table(uses_global_cb_bitflags_model))
 				{
-					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7\n");		
+					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7\n");
 					return false;
 				}
 				if (!uses_global_cb_bitflags_model.is_valid())
 				{
-					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7a\n");		
+					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 7a\n");
 					return false;
 				}
 
@@ -3489,19 +3531,19 @@
 				{
 					if (!sym_codec.read_huffman_table(global_mod_indices_model))
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8\n");		
+						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8\n");
 						return false;
 					}
 					if (!global_mod_indices_model.is_valid())
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8a\n");		
+						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8a\n");
 						return false;
 					}
 				}
 
 				uint32_t cur_uses_global_cb_bitflags = 0;
 				uint32_t uses_global_cb_bitflags_remaining = 0;
-								
+
 				for (uint32_t q = 0; q < num_selectors; q++)
 				{
 					if (!uses_global_cb_bitflags_remaining)
@@ -3522,7 +3564,7 @@
 
 						if (pal_index >= m_pGlobal_sel_codebook->size())
 						{
-							BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8b\n");		
+							BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 8b\n");
 							return false;
 						}
 
@@ -3569,18 +3611,18 @@
 				{
 					if (!sym_codec.read_huffman_table(delta_selector_pal_model))
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 10\n");		
+						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 10\n");
 						return false;
 					}
 
 					if ((num_selectors > 1) && (!delta_selector_pal_model.is_valid()))
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 10a\n");		
+						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_palettes: fail 10a\n");
 						return false;
 					}
 
 					uint8_t prev_bytes[4] = { 0, 0, 0, 0 };
-					
+
 					for (uint32_t i = 0; i < num_selectors; i++)
 					{
 						if (!i)
@@ -3589,7 +3631,7 @@
 							{
 								uint32_t cur_byte = sym_codec.get_bits(8);
 								prev_bytes[j] = static_cast<uint8_t>(cur_byte);
-								
+
 								for (uint32_t k = 0; k < 4; k++)
 									m_selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3);
 							}
@@ -3618,124 +3660,144 @@
 		return true;
 	}
 
-	bool basisu_lowlevel_transcoder::decode_tables(const uint8_t * pTable_data, uint32_t table_data_size)
+	bool basisu_lowlevel_transcoder::decode_tables(const uint8_t* pTable_data, uint32_t table_data_size)
 	{
 		basist::bitwise_decoder sym_codec;
 		if (!sym_codec.init(pTable_data, table_data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 0\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 0\n");
 			return false;
 		}
 
 		if (!sym_codec.read_huffman_table(m_endpoint_pred_model))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 1\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 1\n");
 			return false;
 		}
-		
+
 		if (m_endpoint_pred_model.get_code_sizes().size() == 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 1a\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 1a\n");
 			return false;
 		}
 
 		if (!sym_codec.read_huffman_table(m_delta_endpoint_model))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 2\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 2\n");
 			return false;
 		}
 
 		if (m_delta_endpoint_model.get_code_sizes().size() == 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 2a\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 2a\n");
 			return false;
 		}
 
 		if (!sym_codec.read_huffman_table(m_selector_model))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 3\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 3\n");
 			return false;
 		}
 
 		if (m_selector_model.get_code_sizes().size() == 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 3a\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 3a\n");
 			return false;
 		}
 
 		if (!sym_codec.read_huffman_table(m_selector_history_buf_rle_model))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 4\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 4\n");
 			return false;
 		}
 
 		if (m_selector_history_buf_rle_model.get_code_sizes().size() == 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 4a\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::decode_tables: fail 4a\n");
 			return false;
 		}
 
 		m_selector_history_buf_size = sym_codec.get_bits(13);
-				
+
 		sym_codec.stop();
 
 		return true;
 	}
-			
-	bool basisu_lowlevel_transcoder::transcode_slice(void *pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t *pImage_data, uint32_t image_data_size, block_format fmt, 
-		uint32_t output_block_stride_in_bytes, bool pvrtc_wrap_addressing, bool bc1_allow_threecolor_blocks, uint32_t output_row_pitch_in_blocks)
+
+	bool basisu_lowlevel_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+		uint32_t output_block_stride_in_bytes, bool pvrtc_wrap_addressing, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks, basisu_transcoder_state* pState)
 	{
+		if (!pState)
+			pState = &m_def_state;
+
+		const bool is_video = (header.m_tex_type == cBASISTexTypeVideoFrames);
 		const uint32_t total_blocks = num_blocks_x * num_blocks_y;
 
 		if (!output_row_pitch_in_blocks)
 			output_row_pitch_in_blocks = num_blocks_x;
+
+		std::vector<uint32_t>* pPrev_frame_indices = nullptr; // Stays null unless this slice belongs to a video texture.
+		if (is_video)
+		{
+			// TODO: Add check to make sure the caller hasn't tried skipping past p-frames
+			const bool alpha_flag = (slice_desc.m_flags & cSliceDescFlagsIsAlphaData) != 0;
+			const uint32_t level_index = slice_desc.m_level_index;
+			// The state keeps one index array per [alpha flag][level]; levels at or past cMaxPrevFrameLevels are rejected before indexing.
+			if (level_index >= basisu_transcoder_state::cMaxPrevFrameLevels)
+			{
+				BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: unsupported level_index\n");
+				return false;
+			}
+			// One entry per block, read back as endpoint_index | (selector_index << 16) by the pred == 2 video path below; grown only, never shrunk.
+			pPrev_frame_indices = &pState->m_prev_frame_indices[alpha_flag][level_index];
+			if (pPrev_frame_indices->size() < total_blocks)
+				pPrev_frame_indices->resize(total_blocks);
+		}
 
 		basist::bitwise_decoder sym_codec;
-				
+
 		if (!sym_codec.init(pImage_data, image_data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: sym_codec.init failed\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: sym_codec.init failed\n");
 			return false;
 		}
-		
+
 		approx_move_to_front selector_history_buf(m_selector_history_buf_size);
-				
-		int prev_selector_index = 0;
 
 		const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = (uint32_t)m_selectors.size();
 		const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = m_selector_history_buf_size + SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX;
 		uint32_t cur_selector_rle_count = 0;
-								
+
 		decoder_etc_block block;
 		memset(&block, 0, sizeof(block));
 
 		block.set_flip_bit(true);
 		block.set_diff_bit(true);
 
-		void *pPVRTC_work_mem = nullptr;
-		uint32_t *pPVRTC_endpoints = nullptr;
+		void* pPVRTC_work_mem = nullptr;
+		uint32_t* pPVRTC_endpoints = nullptr;
 		if (fmt == cPVRTC1_4_OPAQUE_ONLY)
 		{
 			pPVRTC_work_mem = malloc(num_blocks_x * num_blocks_y * (sizeof(decoder_etc_block) + sizeof(uint32_t)));
 			if (!pPVRTC_work_mem)
 			{
-				BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: malloc failed\n");		
+				BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: malloc failed\n");
 				return false;
 			}
-			pPVRTC_endpoints = (uint32_t *)&((decoder_etc_block*)pPVRTC_work_mem)[num_blocks_x * num_blocks_y];
+			pPVRTC_endpoints = reinterpret_cast<uint32_t*>(static_cast<decoder_etc_block*>(pPVRTC_work_mem) + num_blocks_x * num_blocks_y); // endpoint words start right after the ETC block array
 		}
-		
-		if (m_block_endpoint_preds[0].size() < num_blocks_x)
+
+		if (pState->m_block_endpoint_preds[0].size() < num_blocks_x)
 		{
-			m_block_endpoint_preds[0].resize(num_blocks_x);
-			m_block_endpoint_preds[1].resize(num_blocks_x);
+			pState->m_block_endpoint_preds[0].resize(num_blocks_x);
+			pState->m_block_endpoint_preds[1].resize(num_blocks_x);
 		}
 
 		uint32_t cur_pred_bits = 0;
 		int prev_endpoint_pred_sym = 0;
 		int endpoint_pred_repeat_count = 0;
 		uint32_t prev_endpoint_index = 0;
-			
+
 		for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
 		{
 			const uint32_t cur_block_endpoint_pred_array = block_y & 1;
@@ -3758,7 +3820,7 @@
 							if (cur_pred_bits == ENDPOINT_PRED_REPEAT_LAST_SYMBOL)
 							{
 								endpoint_pred_repeat_count = sym_codec.decode_vlc(ENDPOINT_PRED_COUNT_VLC_BITS) + ENDPOINT_PRED_MIN_REPEAT_COUNT - 1;
-								  
+
 								cur_pred_bits = prev_endpoint_pred_sym;
 							}
 							else
@@ -3767,16 +3829,16 @@
 							}
 						}
 
-						m_block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x].m_pred_bits = (uint8_t)(cur_pred_bits >> 4);
-					 }
-					 else
-					 {
-						 cur_pred_bits = m_block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_pred_bits;
-					 }
+						pState->m_block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x].m_pred_bits = (uint8_t)(cur_pred_bits >> 4);
+					}
+					else
+					{
+						cur_pred_bits = pState->m_block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_pred_bits;
+					}
 				}
 
 				// Decode endpoint index
-				uint32_t endpoint_index;
+				uint32_t endpoint_index, selector_index = 0;
 
 				const uint32_t pred = cur_pred_bits & 3;
 				cur_pred_bits >>= 2;
@@ -3786,7 +3848,7 @@
 					// Left
 					if (!block_x)
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (0)\n");		
+						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (0)\n");
 						if (pPVRTC_work_mem)
 							free(pPVRTC_work_mem);
 						return false;
@@ -3799,26 +3861,36 @@
 					// Upper
 					if (!block_y)
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (1)\n");		
+						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (1)\n");
 						if (pPVRTC_work_mem)
 							free(pPVRTC_work_mem);
 						return false;
 					}
 
-					endpoint_index = m_block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x].m_endpoint_index;
+					endpoint_index = pState->m_block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x].m_endpoint_index;
 				}
 				else if (pred == 2)
 				{
-					// Upper left
-					if ((!block_x) || (!block_y))
+					if (is_video)
 					{
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (2)\n");		
-						if (pPVRTC_work_mem)
-							free(pPVRTC_work_mem);
-						return false;
+						assert(pred == CR_ENDPOINT_PRED_INDEX);
+						endpoint_index = (*pPrev_frame_indices)[block_x + block_y * num_blocks_x];
+						selector_index = endpoint_index >> 16;
+						endpoint_index &= 0xFFFFU;
 					}
+					else
+					{
+						// Upper left
+						if ((!block_x) || (!block_y))
+						{
+							BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (2)\n");
+							if (pPVRTC_work_mem)
+								free(pPVRTC_work_mem);
+							return false;
+						}
 
-					endpoint_index = m_block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x - 1].m_endpoint_index;
+						endpoint_index = pState->m_block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x - 1].m_endpoint_index;
+					}
 				}
 				else
 				{
@@ -3830,94 +3902,109 @@
 						endpoint_index -= (int)m_endpoints.size();
 				}
 
-				m_block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_endpoint_index = (uint16_t)endpoint_index;
+				pState->m_block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_endpoint_index = (uint16_t)endpoint_index;
 
 				prev_endpoint_index = endpoint_index;
-				
+
 				// Decode selector index
-				uint32_t selector_index;
-				int selector_sym;
-				if (cur_selector_rle_count > 0)
+				if ((!is_video) || (pred != CR_ENDPOINT_PRED_INDEX))
 				{
-					cur_selector_rle_count--;
-
-					selector_sym = (int)m_selectors.size();
-				}
-				else
-				{
-					selector_sym = sym_codec.decode_huffman(m_selector_model);
-
-					if (selector_sym == static_cast<int>(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX))
+					int selector_sym;
+					if (cur_selector_rle_count > 0)
 					{
-						int run_sym = sym_codec.decode_huffman(m_selector_history_buf_rle_model);
+						cur_selector_rle_count--;
 
-						if (run_sym == (SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
-							cur_selector_rle_count = sym_codec.decode_vlc(7) + SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
-						else
-							cur_selector_rle_count = run_sym + SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+						selector_sym = (int)m_selectors.size();
+					}
+					else
+					{
+						selector_sym = sym_codec.decode_huffman(m_selector_model);
 
-						if (cur_selector_rle_count > total_blocks)
+						if (selector_sym == static_cast<int>(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX))
+						{
+							int run_sym = sym_codec.decode_huffman(m_selector_history_buf_rle_model);
+
+							if (run_sym == (SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
+								cur_selector_rle_count = sym_codec.decode_vlc(7) + SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+							else
+								cur_selector_rle_count = run_sym + SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+
+							if (cur_selector_rle_count > total_blocks)
+							{
+								// The file is corrupted or we've got a bug.
+								BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (3)\n");
+								if (pPVRTC_work_mem)
+									free(pPVRTC_work_mem);
+								return false;
+							}
+
+							selector_sym = (int)m_selectors.size();
+
+							cur_selector_rle_count--;
+						}
+					}
+
+					if (selector_sym >= (int)m_selectors.size())
+					{
+						assert(m_selector_history_buf_size > 0);
+
+						int history_buf_index = selector_sym - (int)m_selectors.size();
+
+						if (history_buf_index >= (int)selector_history_buf.size())
 						{
 							// The file is corrupted or we've got a bug.
-							BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (3)\n");		
+							BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (4)\n");
 							if (pPVRTC_work_mem)
 								free(pPVRTC_work_mem);
 							return false;
 						}
 
-						selector_sym = (int)m_selectors.size();
+						selector_index = selector_history_buf[history_buf_index];
 
-						cur_selector_rle_count--;
+						if (history_buf_index != 0)
+							selector_history_buf.use(history_buf_index);
 					}
-				}
-
-				if (selector_sym >= (int)m_selectors.size())
-				{
-					assert(m_selector_history_buf_size > 0);
-
-					int history_buf_index = selector_sym - (int)m_selectors.size();
-
-					if (history_buf_index >= (int)selector_history_buf.size())
+					else
 					{
-						// The file is corrupted or we've got a bug.
-						BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (4)\n");		
-						if (pPVRTC_work_mem)
-							free(pPVRTC_work_mem);
-						return false;
+						selector_index = selector_sym;
+
+						if (m_selector_history_buf_size)
+							selector_history_buf.add(selector_index);
 					}
-
-					selector_index = selector_history_buf[history_buf_index];
-
-					if (history_buf_index != 0)
-						selector_history_buf.use(history_buf_index);
 				}
-				else
-				{
-					selector_index = selector_sym;
-
-					if (m_selector_history_buf_size)
-						selector_history_buf.add(selector_index);
-				}
-
-				prev_selector_index = selector_index;
 
 				if ((endpoint_index >= m_endpoints.size()) || (selector_index >= m_selectors.size()))
 				{
 					// The file is corrupted or we've got a bug.
-					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (5)\n");		
+					BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: invalid datastream (5)\n");
 					if (pPVRTC_work_mem)
 						free(pPVRTC_work_mem);
 					return false;
 				}
 
-				const endpoint *pEndpoint0 = &m_endpoints[endpoint_index];
-												
+				if (is_video)
+					(*pPrev_frame_indices)[block_x + block_y * num_blocks_x] = endpoint_index | (selector_index << 16);
+
+#if BASISD_ENABLE_DEBUG_FLAGS
+				if ((g_debug_flags & cDebugFlagVisCRs) && ((fmt == cETC1) || (fmt == cBC1)))
+				{
+					if ((is_video) && (pred == CR_ENDPOINT_PRED_INDEX)) // same named constant the selector-decode guard above uses
+					{
+						decoder_etc_block* pDst_block = reinterpret_cast<decoder_etc_block*>(static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks) * output_block_stride_in_bytes);
+						memset(pDst_block, 0xFF, 8);
+						continue;
+					}
+				}
+#endif
+
+				const endpoint* pEndpoint0 = &m_endpoints[endpoint_index];
+
 				block.set_base5_color(decoder_etc_block::pack_color5(pEndpoint0->m_color5, false));
 
 				block.set_inten_table(0, pEndpoint0->m_inten5);
 				block.set_inten_table(1, pEndpoint0->m_inten5);
 
-				const selector *pSelector = &m_selectors[selector_index];
+				const selector* pSelector = &m_selectors[selector_index];
 
 				switch (fmt)
 				{
@@ -3935,7 +4022,12 @@
 
 					void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks) * output_block_stride_in_bytes;
 #if BASISD_SUPPORT_DXT1
-					convert_etc1s_to_dxt1(static_cast<dxt1_block*>(pDst_block), &block, pSelector, bc1_allow_threecolor_blocks);
+#if BASISD_ENABLE_DEBUG_FLAGS
+					if (g_debug_flags & (cDebugFlagVisBC1Sels | cDebugFlagVisBC1Endpoints))
+						convert_etc1s_to_dxt1_vis(static_cast<dxt1_block*>(pDst_block), &block, pSelector, bc1_allow_threecolor_blocks);
+					else
+#endif
+						convert_etc1s_to_dxt1(static_cast<dxt1_block*>(pDst_block), &block, pSelector, bc1_allow_threecolor_blocks);
 #else
 					assert(0);
 #endif
@@ -3991,7 +4083,7 @@
 				{
 #if BASISD_SUPPORT_BC7
 					block.set_raw_selector_bits(pSelector->m_bytes[0], pSelector->m_bytes[1], pSelector->m_bytes[2], pSelector->m_bytes[3]);
-					
+
 					void* pDst_block = static_cast<uint8_t*>(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks) * output_block_stride_in_bytes;
 					convert_etc1s_to_bc7_m6(static_cast<bc7_mode_6*>(pDst_block), &block, pSelector);
 #else	
@@ -4023,12 +4115,15 @@
 
 		if (endpoint_pred_repeat_count != 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: endpoint_pred_repeat_count != 0. The file is corrupted or this is a bug\n");		
+			BASISU_DEVEL_ERROR("basisu_lowlevel_transcoder::transcode_slice: endpoint_pred_repeat_count != 0. The file is corrupted or this is a bug\n");
+			// Release the PVRTC scratch buffer here too, as the other error returns in this function do; otherwise it leaks.
+			if (pPVRTC_work_mem)
+				free(pPVRTC_work_mem);
 			return false;
 		}
 
 		//assert(endpoint_pred_repeat_count == 0);
-		
+
 		if (fmt == cPVRTC1_4_OPAQUE_ONLY)
 		{
 			// PVRTC post process - create per-pixel modulation values.
@@ -4043,9 +4135,7 @@
 		return true;
 	}
 
-	basisu_transcoder::basisu_transcoder(const etc1_global_selector_codebook * pGlobal_sel_codebook) :
-		m_pFile_data(NULL),
-		m_file_data_size(0),
+	basisu_transcoder::basisu_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook) :
 		m_lowlevel_decoder(pGlobal_sel_codebook)
 	{
 	}
@@ -4055,12 +4145,12 @@
 		if (!validate_header(pData, data_size))
 			return false;
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
 
 #if !BASISU_NO_HEADER_OR_DATA_CRC16_CHECKS
 		if (crc16(&pHeader->m_data_size, sizeof(basis_file_header) - BASISU_OFFSETOF(basis_file_header, m_data_size), 0) != pHeader->m_header_crc16)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header CRC check failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header CRC check failed\n");
 			return false;
 		}
 
@@ -4068,7 +4158,7 @@
 		{
 			if (crc16(reinterpret_cast<const uint8_t*>(pData) + sizeof(basis_file_header), pHeader->m_data_size, 0) != pHeader->m_data_crc16)
 			{
-				BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: data CRC check failed\n");		
+				BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: data CRC check failed\n");
 				return false;
 			}
 		}
@@ -4076,24 +4166,24 @@
 
 		return true;
 	}
-	
+
 	bool basisu_transcoder::validate_header_quick(const void* pData, uint32_t data_size) const
 	{
 		if (data_size <= sizeof(basis_file_header))
 			return false;
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
 
 		if ((pHeader->m_sig != basis_file_header::cBASISSigValue) || (pHeader->m_ver != BASISD_SUPPORTED_BASIS_VERSION) || (pHeader->m_header_size != sizeof(basis_file_header)))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: header has an invalid signature, or file version is unsupported\n");
 			return false;
 		}
 
 		uint32_t expected_file_size = sizeof(basis_file_header) + pHeader->m_data_size;
 		if (data_size < expected_file_size)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: source buffer is too small\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: source buffer is too small\n");
 			return false;
 		}
 
@@ -4103,14 +4193,14 @@
 			return false;
 		}
 
-		if ( (pHeader->m_slice_desc_file_ofs >= data_size) ||
-			  ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices))
+		if ((pHeader->m_slice_desc_file_ofs >= data_size) ||
+			((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices))
 			)
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: passed in buffer is too small or data is corrupted\n");
 			return false;
 		}
-							
+
 		return true;
 	}
 
@@ -4118,34 +4208,34 @@
 	{
 		if (data_size <= sizeof(basis_file_header))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
 
 		if ((pHeader->m_sig != basis_file_header::cBASISSigValue) || (pHeader->m_ver != BASISD_SUPPORTED_BASIS_VERSION) || (pHeader->m_header_size != sizeof(basis_file_header)))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n");
 			return false;
 		}
 
 		uint32_t expected_file_size = sizeof(basis_file_header) + pHeader->m_data_size;
 		if (data_size < expected_file_size)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small, or header is corrupted\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small, or header is corrupted\n");
 			return false;
 		}
 
 		if ((!pHeader->m_total_images) || (!pHeader->m_total_slices))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (total images or slices are 0)\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (total images or slices are 0)\n");
 			return false;
 		}
 
 		if (pHeader->m_total_images > pHeader->m_total_slices)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (too many images)\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (too many images)\n");
 			return false;
 		}
 
@@ -4153,7 +4243,7 @@
 		{
 			if (pHeader->m_total_slices & 1)
 			{
-				BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid alpha basis file\n");		
+				BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid alpha basis file\n");
 				return false;
 			}
 		}
@@ -4161,12 +4251,12 @@
 		if ((pHeader->m_flags & cBASISHeaderFlagETC1S) == 0)
 		{
 			// We only support ETC1S in basis universal
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (ETC1S flag check)\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (ETC1S flag check)\n");
 			return false;
 		}
 
-		if ( (pHeader->m_slice_desc_file_ofs >= data_size) ||
-			  ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices))
+		if ((pHeader->m_slice_desc_file_ofs >= data_size) ||
+			((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices))
 			)
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: passed in buffer is too small or data is corrupted\n");
@@ -4176,18 +4266,18 @@
 		return true;
 	}
 
-	basis_texture_type basisu_transcoder::get_texture_type(const void *pData, uint32_t data_size) const
+	basis_texture_type basisu_transcoder::get_texture_type(const void* pData, uint32_t data_size) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_texture_type: header validation failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_texture_type: header validation failed\n");
 			return cBASISTexType2DArray;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		basis_texture_type btt = static_cast<basis_texture_type>(static_cast<uint8_t>(pHeader->m_tex_type));
-		
+
 		if (btt >= cBASISTexTypeTotal)
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: header's texture type field is invalid\n");
@@ -4197,77 +4287,78 @@
 		return btt;
 	}
 
-	bool basisu_transcoder::get_userdata(const void *pData, uint32_t data_size, uint32_t &userdata0, uint32_t &userdata1) const
+	bool basisu_transcoder::get_userdata(const void* pData, uint32_t data_size, uint32_t& userdata0, uint32_t& userdata1) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_userdata: header validation failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_userdata: header validation failed\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		userdata0 = pHeader->m_userdata0;
 		userdata1 = pHeader->m_userdata1;
 		return true;
 	}
-	
-	uint32_t basisu_transcoder::get_total_images(const void *pData, uint32_t data_size) const
+
+	uint32_t basisu_transcoder::get_total_images(const void* pData, uint32_t data_size) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header validation failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header validation failed\n");
 			return 0;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		return pHeader->m_total_images;
 	}
 
-	bool basisu_transcoder::get_image_info(const void *pData, uint32_t data_size, basisu_image_info &image_info, uint32_t image_index) const
+	bool basisu_transcoder::get_image_info(const void* pData, uint32_t data_size, basisu_image_info& image_info, uint32_t image_index) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: header validation failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: header validation failed\n");
 			return false;
 		}
-				
+
 		int slice_index = find_first_slice_index(pData, data_size, image_index, 0);
 		if (slice_index < 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid slice index\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid slice index\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		if (image_index >= pHeader->m_total_images)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n");
 			return false;
 		}
 
-		const basis_slice_desc *pSlice_descs = reinterpret_cast<const basis_slice_desc *>(static_cast<const uint8_t *>(pData) + pHeader->m_slice_desc_file_ofs);
+		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(static_cast<const uint8_t*>(pData) + pHeader->m_slice_desc_file_ofs);
 
 		uint32_t total_levels = 1;
 		for (uint32_t i = slice_index + 1; i < pHeader->m_total_slices; i++)
 			if (pSlice_descs[i].m_image_index == image_index)
 				total_levels = basisu::maximum<uint32_t>(total_levels, pSlice_descs[i].m_level_index + 1);
-			else 
+			else
 				break;
 
 		if (total_levels > 16)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: too many levels\n");
 			return false;
 		}
 
-		const basis_slice_desc &slice_desc = pSlice_descs[slice_index];
+		const basis_slice_desc& slice_desc = pSlice_descs[slice_index];
 
 		image_info.m_image_index = image_index;
 		image_info.m_total_levels = total_levels;
 		image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0;
+		image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0;
 		image_info.m_width = slice_desc.m_num_blocks_x * 4;
 		image_info.m_height = slice_desc.m_num_blocks_y * 4;
 		image_info.m_orig_width = slice_desc.m_orig_width;
@@ -4279,112 +4370,114 @@
 
 		return true;
 	}
-	
-	uint32_t basisu_transcoder::get_total_image_levels(const void *pData, uint32_t data_size, uint32_t image_index) const
+
+	uint32_t basisu_transcoder::get_total_image_levels(const void* pData, uint32_t data_size, uint32_t image_index) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: header validation failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: header validation failed\n");
 			return false;
 		}
 
 		int slice_index = find_first_slice_index(pData, data_size, image_index, 0);
 		if (slice_index < 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: failed finding slice\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: failed finding slice\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		if (image_index >= pHeader->m_total_images)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image_index\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image_index\n");
 			return false;
 		}
 
-		const basis_slice_desc *pSlice_descs = reinterpret_cast<const basis_slice_desc *>(static_cast<const uint8_t *>(pData) + pHeader->m_slice_desc_file_ofs);
+		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(static_cast<const uint8_t*>(pData) + pHeader->m_slice_desc_file_ofs);
 
 		uint32_t total_levels = 1;
 		for (uint32_t i = slice_index + 1; i < pHeader->m_total_slices; i++)
 			if (pSlice_descs[i].m_image_index == image_index)
 				total_levels = basisu::maximum<uint32_t>(total_levels, pSlice_descs[i].m_level_index + 1);
-			else 
+			else
 				break;
 
-		if (total_levels > 16)
+		const uint32_t cMaxSupportedLevels = 16;
+		if (total_levels > cMaxSupportedLevels)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image levels!\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image levels!\n");
 			return false;
 		}
-				
+
 		return total_levels;
 	}
-		
-	bool basisu_transcoder::get_image_level_desc(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t &orig_width, uint32_t &orig_height, uint32_t &total_blocks) const
+
+	bool basisu_transcoder::get_image_level_desc(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t& orig_width, uint32_t& orig_height, uint32_t& total_blocks) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: header validation failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: header validation failed\n");
 			return false;
 		}
-				
+
 		int slice_index = find_first_slice_index(pData, data_size, image_index, level_index);
 		if (slice_index < 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: failed finding slice\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: failed finding slice\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		if (image_index >= pHeader->m_total_images)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: invalid image_index\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: invalid image_index\n");
 			return false;
 		}
 
-		const basis_slice_desc *pSlice_descs = reinterpret_cast<const basis_slice_desc *>(static_cast<const uint8_t *>(pData) + pHeader->m_slice_desc_file_ofs);
-		
-		const basis_slice_desc &slice_desc = pSlice_descs[slice_index];
+		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(static_cast<const uint8_t*>(pData) + pHeader->m_slice_desc_file_ofs);
+
+		const basis_slice_desc& slice_desc = pSlice_descs[slice_index];
 
 		orig_width = slice_desc.m_orig_width;
 		orig_height = slice_desc.m_orig_height;
 		total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y;
-				
+
 		return true;
 	}
 
-	bool basisu_transcoder::get_image_level_info(const void *pData, uint32_t data_size, basisu_image_level_info &image_info, uint32_t image_index, uint32_t level_index) const
+	bool basisu_transcoder::get_image_level_info(const void* pData, uint32_t data_size, basisu_image_level_info& image_info, uint32_t image_index, uint32_t level_index) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: validate_file_checksums failed\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: validate_file_checksums failed\n");
 			return false;
 		}
-				
+
 		int slice_index = find_first_slice_index(pData, data_size, image_index, level_index);
 		if (slice_index < 0)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: failed finding slice\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: failed finding slice\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = static_cast<const basis_file_header *>(pData);
+		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 
 		if (image_index >= pHeader->m_total_images)
 		{
-			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: invalid image_index\n");		
+			BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: invalid image_index\n");
 			return false;
 		}
 
-		const basis_slice_desc *pSlice_descs = reinterpret_cast<const basis_slice_desc *>(static_cast<const uint8_t *>(pData) + pHeader->m_slice_desc_file_ofs);
+		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(static_cast<const uint8_t*>(pData) + pHeader->m_slice_desc_file_ofs);
 
-		const basis_slice_desc &slice_desc = pSlice_descs[slice_index];
+		const basis_slice_desc& slice_desc = pSlice_descs[slice_index];
 
 		image_info.m_image_index = image_index;
 		image_info.m_level_index = level_index;
 		image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0;
+		image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0;
 		image_info.m_width = slice_desc.m_num_blocks_x * 4;
 		image_info.m_height = slice_desc.m_num_blocks_y * 4;
 		image_info.m_orig_width = slice_desc.m_orig_width;
@@ -4397,17 +4490,17 @@
 		return true;
 	}
 
-	bool basisu_transcoder::get_file_info(const void* pData, uint32_t data_size, basisu_file_info & file_info) const
+	bool basisu_transcoder::get_file_info(const void* pData, uint32_t data_size, basisu_file_info& file_info) const
 	{
 		if (!validate_file_checksums(pData, data_size, false))
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: validate_file_checksums failed\n");
 			return false;
 		}
-				
+
 		const basis_file_header* pHeader = static_cast<const basis_file_header*>(pData);
 		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(static_cast<const uint8_t*>(pData) + pHeader->m_slice_desc_file_ofs);
-				
+
 		file_info.m_version = pHeader->m_ver;
 
 		file_info.m_total_header_size = sizeof(basis_file_header) + pHeader->m_total_slices * sizeof(basis_slice_desc);
@@ -4441,7 +4534,7 @@
 		file_info.m_us_per_frame = pHeader->m_us_per_frame;
 		file_info.m_userdata0 = pHeader->m_userdata0;
 		file_info.m_userdata1 = pHeader->m_userdata1;
-				
+
 		file_info.m_image_mipmap_levels.resize(0);
 		file_info.m_image_mipmap_levels.resize(pHeader->m_total_images);
 
@@ -4452,7 +4545,7 @@
 			file_info.m_slices_size += pSlice_descs[i].m_file_size;
 
 			basisu_slice_info& slice_info = file_info.m_slice_info[i];
-						
+
 			slice_info.m_orig_width = pSlice_descs[i].m_orig_width;
 			slice_info.m_orig_height = pSlice_descs[i].m_orig_height;
 			slice_info.m_width = pSlice_descs[i].m_num_blocks_x * 4;
@@ -4466,6 +4559,7 @@
 			slice_info.m_level_index = pSlice_descs[i].m_level_index;
 			slice_info.m_unpacked_slice_crc16 = pSlice_descs[i].m_slice_data_crc16;
 			slice_info.m_alpha_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsIsAlphaData) != 0;
+			slice_info.m_iframe_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsFrameIsIFrame) != 0;
 
 			if (pSlice_descs[i].m_image_index >= pHeader->m_total_images)
 			{
@@ -4485,23 +4579,23 @@
 		return true;
 	}
 
-	bool basisu_transcoder::start_transcoding(const void *pData, uint32_t data_size) const
+	bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size) const
 	{
 		if (m_lowlevel_decoder.m_endpoints.size())
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: already called start_transcoding\n");
 			return true;
 		}
-	
+
 		if (!validate_header_quick(pData, data_size))
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: header validation failed\n");
 			return false;
 		}
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
 
-		const uint8_t *pDataU8 = static_cast<const uint8_t *>(pData);
+		const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData);
 
 		if (!pHeader->m_endpoint_cb_file_size || !pHeader->m_selector_cb_file_size || !pHeader->m_tables_file_size)
 		{
@@ -4513,7 +4607,7 @@
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: file is corrupted or passed in buffer too small (1)\n");
 			return false;
 		}
-		
+
 		if (pHeader->m_endpoint_cb_file_size > (data_size - pHeader->m_endpoint_cb_file_ofs))
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: file is corrupted or passed in buffer too small (2)\n");
@@ -4549,31 +4643,31 @@
 		return true;
 	}
 
-	bool basisu_transcoder::transcode_slice(const void *pData, uint32_t data_size, uint32_t slice_index, void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks, block_format fmt, 
-		uint32_t output_block_stride_in_bytes, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks) const
+	bool basisu_transcoder::transcode_slice(const void* pData, uint32_t data_size, uint32_t slice_index, void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks, block_format fmt,
+		uint32_t output_block_stride_in_bytes, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks, basisu_transcoder_state* pState) const
 	{
 		if (!m_lowlevel_decoder.m_endpoints.size())
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: must call start_transcoding first\n");
 			return false;
 		}
-			
+
 		if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2)
 		{
 			// TODO: Not yet supported
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n");
 			return false;
 		}
-		
+
 		if (!validate_header_quick(pData, data_size))
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: header validation failed\n");
 			return false;
 		}
-			
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header *>(pData);
 
-		const uint8_t *pDataU8 = static_cast<const uint8_t* >(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
+
+		const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData);
 
 		if (slice_index >= pHeader->m_total_slices)
 		{
@@ -4581,7 +4675,7 @@
 			return false;
 		}
 
-		const basis_slice_desc &slice_desc = reinterpret_cast<const basis_slice_desc *>(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_index];
+		const basis_slice_desc& slice_desc = reinterpret_cast<const basis_slice_desc*>(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_index];
 
 		uint32_t total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y;
 		if (output_blocks_buf_size_in_blocks < total_blocks)
@@ -4615,34 +4709,34 @@
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: invalid slice_desc.m_file_size, or passed in buffer too small\n");
 			return false;
 		}
-				
+
 		return m_lowlevel_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y,
 			pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size,
-			fmt, output_block_stride_in_bytes, (decode_flags & cDecodeFlagsPVRTCWrapAddressing) != 0, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, output_row_pitch_in_blocks);
+			fmt, output_block_stride_in_bytes, (decode_flags & cDecodeFlagsPVRTCWrapAddressing) != 0, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks, pState);
 	}
 
-	int basisu_transcoder::find_first_slice_index(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const
+	int basisu_transcoder::find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const
 	{
 		(void)data_size;
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
-		const uint8_t *pDataU8 = static_cast<const uint8_t*>(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData);
 
 		// For very large basis files this search could be painful
 		// TODO: Binary search this
 		for (uint32_t slice_iter = 0; slice_iter < pHeader->m_total_slices; slice_iter++)
 		{
-			const basis_slice_desc &slice_desc = reinterpret_cast<const basis_slice_desc *>(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_iter];
+			const basis_slice_desc& slice_desc = reinterpret_cast<const basis_slice_desc*>(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_iter];
 			if ((slice_desc.m_image_index == image_index) && (slice_desc.m_level_index == level_index))
 				return slice_iter;
 		}
-		
+
 		BASISU_DEVEL_ERROR("basisu_transcoder::find_first_slice_index: didn't find slice\n");
 
 		return -1;
 	}
 
-	int basisu_transcoder::find_slice(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const
+	int basisu_transcoder::find_slice(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const
 	{
 		if (!validate_header_quick(pData, data_size))
 		{
@@ -4650,9 +4744,9 @@
 			return false;
 		}
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
-		const uint8_t *pDataU8 = static_cast<const uint8_t*>(pData);
-		const basis_slice_desc *pSlice_descs = reinterpret_cast<const basis_slice_desc*>(pDataU8 + pHeader->m_slice_desc_file_ofs);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData);
+		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(pDataU8 + pHeader->m_slice_desc_file_ofs);
 
 		// For very large basis files this search could be painful
 		// TODO: Binary search this
@@ -4666,22 +4760,22 @@
 					return slice_iter;
 			}
 		}
-		
+
 		BASISU_DEVEL_ERROR("basisu_transcoder::find_slice: didn't find slice\n");
 
 		return -1;
 	}
 
 	static void write_opaque_alpha_blocks(
-		uint32_t num_blocks_x, uint32_t num_blocks_y, 
-		void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks, block_format fmt, 
+		uint32_t num_blocks_x, uint32_t num_blocks_y,
+		void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks, block_format fmt,
 		uint32_t block_stride_in_bytes, uint32_t output_row_pitch_in_blocks)
 	{
 		BASISU_NOTE_UNUSED(output_blocks_buf_size_in_blocks);
 
 		if (!output_row_pitch_in_blocks)
 			output_row_pitch_in_blocks = num_blocks_x;
-		
+
 		if (fmt == cETC2_EAC_A8)
 		{
 #if BASISD_SUPPORT_ETC2_EAC_A8
@@ -4689,7 +4783,7 @@
 			blk.m_base = 255;
 			blk.m_multiplier = 1;
 			blk.m_table = 13;
-			
+
 			// Selectors are all 4's
 			static const uint8_t s_etc2_eac_a8_sel4[6] = { 0x92, 0x49, 0x24, 0x92, 0x49, 0x24 };
 			memcpy(&blk.m_selectors, s_etc2_eac_a8_sel4, sizeof(s_etc2_eac_a8_sel4));
@@ -4699,7 +4793,7 @@
 				uint32_t dst_ofs = y * output_row_pitch_in_blocks * block_stride_in_bytes;
 				for (uint32_t x = 0; x < num_blocks_x; x++)
 				{
-					memcpy((uint8_t *)pOutput_blocks + dst_ofs, &blk, sizeof(blk));
+					memcpy((uint8_t*)pOutput_blocks + dst_ofs, &blk, sizeof(blk));
 					dst_ofs += block_stride_in_bytes;
 				}
 			}
@@ -4712,33 +4806,33 @@
 			blk.m_endpoints[0] = 255;
 			blk.m_endpoints[1] = 255;
 			memset(blk.m_selectors, 0, sizeof(blk.m_selectors));
-			
+
 			for (uint32_t y = 0; y < num_blocks_y; y++)
 			{
 				uint32_t dst_ofs = y * output_row_pitch_in_blocks * block_stride_in_bytes;
 				for (uint32_t x = 0; x < num_blocks_x; x++)
 				{
-					memcpy((uint8_t *)pOutput_blocks + dst_ofs, &blk, sizeof(blk));
+					memcpy((uint8_t*)pOutput_blocks + dst_ofs, &blk, sizeof(blk));
 					dst_ofs += block_stride_in_bytes;
 				}
 			}
 #endif
 		}
 	}
-		
+
 	bool basisu_transcoder::transcode_image_level(
-		const void *pData, uint32_t data_size,
-		uint32_t image_index, uint32_t level_index, 
-		void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks,
+		const void* pData, uint32_t data_size,
+		uint32_t image_index, uint32_t level_index,
+		void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks,
 		transcoder_texture_format fmt,
-		uint32_t decode_flags, uint32_t output_row_pitch_in_blocks) const
+		uint32_t decode_flags, uint32_t output_row_pitch_in_blocks, basisu_transcoder_state *pState) const
 	{
 		if (!m_lowlevel_decoder.m_endpoints.size())
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: must call start_transcoding() first\n");
 			return false;
 		}
-					
+
 		const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0;
 
 		if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2)
@@ -4754,14 +4848,14 @@
 			return false;
 		}
 
-		const basis_file_header *pHeader = reinterpret_cast<const basis_file_header*>(pData);
+		const basis_file_header* pHeader = reinterpret_cast<const basis_file_header*>(pData);
 
-		const uint8_t *pDataU8 = static_cast<const uint8_t*>(pData);
+		const uint8_t* pDataU8 = static_cast<const uint8_t*>(pData);
 
-		const basis_slice_desc *pSlice_descs = reinterpret_cast<const basis_slice_desc*>(pDataU8 + pHeader->m_slice_desc_file_ofs);
+		const basis_slice_desc* pSlice_descs = reinterpret_cast<const basis_slice_desc*>(pDataU8 + pHeader->m_slice_desc_file_ofs);
 
 		const bool basis_file_has_alpha_slices = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0;
-		
+
 		int slice_index = find_first_slice_index(pData, data_size, image_index, level_index);
 		if (slice_index < 0)
 		{
@@ -4769,7 +4863,7 @@
 			// Unable to find the requested image/level 
 			return false;
 		}
-				
+
 		uint32_t total_slices = 1;
 		switch (fmt)
 		{
@@ -4781,11 +4875,12 @@
 		default:
 			break;
 		}
-				
+		(void)total_slices;
+
 		if (pSlice_descs[slice_index].m_flags & cSliceDescFlagsIsAlphaData)
 		{
 			BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has out of order alpha slice\n");
-			
+
 			// The first slice shouldn't have alpha data in a properly formed basis file
 			return false;
 		}
@@ -4807,7 +4902,7 @@
 				// This slice should have alpha data
 				return false;
 			}
-						
+
 			if ((pSlice_descs[slice_index].m_num_blocks_x != pSlice_descs[slice_index + 1].m_num_blocks_x) || (pSlice_descs[slice_index].m_num_blocks_y != pSlice_descs[slice_index + 1].m_num_blocks_y))
 			{
 				BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file slice dimensions bad\n");
@@ -4832,19 +4927,27 @@
 		bool status = false;
 
 		const uint32_t total_slice_blocks = pSlice_descs[slice_index].m_num_blocks_x * pSlice_descs[slice_index].m_num_blocks_y;
-								
+
+		if ((fmt == cTFPVRTC1_4_OPAQUE_ONLY) && (output_blocks_buf_size_in_blocks > total_slice_blocks))
+		{
+			// The transcoder doesn't write beyond total_slice_blocks, so we need to clear the rest ourselves.
+			// For GL usage, PVRTC1 4bpp image size is (max(width, 8)* max(height, 8) * 4 + 7) / 8. 
+			// However, for KTX and internally in Basis this formula isn't used, it's just ((width+3)/4) * ((height+3)/4) * bytes_per_block. This is all the transcoder actually writes to memory.
+			memset(static_cast<uint8_t*>(pOutput_blocks) + total_slice_blocks * bytes_per_block, 0, (output_blocks_buf_size_in_blocks - total_slice_blocks) * bytes_per_block);
+		}
+
 		switch (fmt)
 		{
 		case cTFETC1:
 		{
 			assert(total_slices == 1);
-			
+
 			uint32_t slice_index_to_decode = slice_index;
 			// If the caller wants us to transcode the mip level's alpha data, then use the next slice.
 			if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats))
 				slice_index_to_decode++;
 
-			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cETC1, bytes_per_block, decode_flags, output_row_pitch_in_blocks);
+			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cETC1, bytes_per_block, decode_flags, output_row_pitch_in_blocks, pState);
 			if (!status)
 			{
 				BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC1 failed\n");
@@ -4863,7 +4966,7 @@
 			if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats))
 				slice_index_to_decode++;
 
-			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC1, bytes_per_block, decode_flags, output_row_pitch_in_blocks);
+			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC1, bytes_per_block, decode_flags, output_row_pitch_in_blocks, pState);
 			if (!status)
 			{
 				BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC1 failed\n");
@@ -4882,7 +4985,7 @@
 			if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats))
 				slice_index_to_decode++;
 
-			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC4, bytes_per_block, decode_flags, output_row_pitch_in_blocks);
+			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC4, bytes_per_block, decode_flags, output_row_pitch_in_blocks, pState);
 			if (!status)
 			{
 				BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC4 failed\n");
@@ -4902,7 +5005,7 @@
 				slice_index_to_decode++;
 
 			// output_row_pitch_in_blocks is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?)
-			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cPVRTC1_4_OPAQUE_ONLY, bytes_per_block, decode_flags, output_row_pitch_in_blocks);
+			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cPVRTC1_4_OPAQUE_ONLY, bytes_per_block, decode_flags, output_row_pitch_in_blocks, pState);
 			if (!status)
 			{
 				BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to PVRTC1 4 opaque only failed\n");
@@ -4915,13 +5018,13 @@
 			return false;
 #endif
 			assert(total_slices == 1);
-			
+
 			uint32_t slice_index_to_decode = slice_index;
 			// If the caller wants us to transcode the mip level's alpha data, then use the next slice.
 			if ((basis_file_has_alpha_slices) && (transcode_alpha_data_to_opaque_formats))
 				slice_index_to_decode++;
 
-			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC7_M6_OPAQUE_ONLY, bytes_per_block, decode_flags, output_row_pitch_in_blocks);
+			status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC7_M6_OPAQUE_ONLY, bytes_per_block, decode_flags, output_row_pitch_in_blocks, pState);
 			if (!status)
 			{
 				BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC7 m6 opaque only failed\n");
@@ -4934,11 +5037,11 @@
 			return false;
 #endif
 			assert(total_slices == 2);
-			
+
 			if (basis_file_has_alpha_slices)
 			{
 				// First decode the alpha data 
-				status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks, cETC2_EAC_A8, 16, decode_flags, output_row_pitch_in_blocks);
+				status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks, cETC2_EAC_A8, 16, decode_flags, output_row_pitch_in_blocks, pState);
 			}
 			else
 			{
@@ -4949,7 +5052,7 @@
 			if (status)
 			{
 				// Now decode the color data
-				status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks, cETC1, 16, decode_flags, output_row_pitch_in_blocks);
+				status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks, cETC1, 16, decode_flags, output_row_pitch_in_blocks, pState);
 				if (!status)
 				{
 					BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to ETC2 RGB failed\n");
@@ -4974,7 +5077,7 @@
 			// First decode the alpha data 
 			if (basis_file_has_alpha_slices)
 			{
-				status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC4, 16, decode_flags, output_row_pitch_in_blocks);
+				status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC4, 16, decode_flags, output_row_pitch_in_blocks, pState);
 			}
 			else
 			{
@@ -4985,7 +5088,7 @@
 			if (status)
 			{
 				// Now decode the color data. Forbid 3 color blocks, which aren't allowed in BC3.
-				status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks, cBC1, 16, decode_flags | cDecodeFlagsBC1ForbidThreeColorBlocks, output_row_pitch_in_blocks);
+				status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks, cBC1, 16, decode_flags | cDecodeFlagsBC1ForbidThreeColorBlocks, output_row_pitch_in_blocks, pState);
 				if (!status)
 				{
 					BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC3 RGB failed\n");
@@ -5005,13 +5108,13 @@
 #endif
 			assert(total_slices == 2);
 			// Decode the R data (actually the green channel of the color data slice in the basis file)
-			status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC4, 16, decode_flags, output_row_pitch_in_blocks);
+			status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks, cBC4, 16, decode_flags, output_row_pitch_in_blocks, pState);
 			if (status)
 			{
 				if (basis_file_has_alpha_slices)
 				{
 					// Decode the G data (actually the green channel of the alpha data slice in the basis file)
-					status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks, cBC4, 16, decode_flags, output_row_pitch_in_blocks);
+					status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks, cBC4, 16, decode_flags, output_row_pitch_in_blocks, pState);
 					if (!status)
 					{
 						BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: transcode_slice() to BC5 1 failed\n");
@@ -5062,7 +5165,7 @@
 		return 0;
 	}
 
-	const char *basis_get_format_name(transcoder_texture_format fmt)
+	const char* basis_get_format_name(transcoder_texture_format fmt)
 	{
 		switch (fmt)
 		{
@@ -5082,19 +5185,19 @@
 		return "";
 	}
 
-	const char *basis_get_texture_type_name(basis_texture_type tex_type)
+	const char* basis_get_texture_type_name(basis_texture_type tex_type)
 	{
 		switch (tex_type)
 		{
-			case cBASISTexType2D: return "2D";
-			case cBASISTexType2DArray: return "2D array";
-			case cBASISTexTypeCubemapArray: return "cubemap array";
-			case cBASISTexTypeVideoFrames: return "video";
-			case cBASISTexTypeVolume: return "3D";
-			default: 
-				assert(0);
-				BASISU_DEVEL_ERROR("basis_get_texture_type_name: Invalid tex_type\n");
-				break;
+		case cBASISTexType2D: return "2D";
+		case cBASISTexType2DArray: return "2D array";
+		case cBASISTexTypeCubemapArray: return "cubemap array";
+		case cBASISTexTypeVideoFrames: return "video";
+		case cBASISTexTypeVolume: return "3D";
+		default:
+			assert(0);
+			BASISU_DEVEL_ERROR("basis_get_texture_type_name: Invalid tex_type\n");
+			break;
 		}
 		return "";
 	}
@@ -5103,8 +5206,8 @@
 	{
 		switch (fmt)
 		{
-		case cTFETC2: 
-		case cTFBC3: 
+		case cTFETC2:
+		case cTFBC3:
 			return true;
 		default:
 			break;
@@ -5116,18 +5219,18 @@
 	{
 		switch (fmt)
 		{
-			case cTFETC1: return basisu::cETC1;
-			case cTFBC1: return basisu::cBC1;
-			case cTFBC4: return basisu::cBC4;
-			case cTFPVRTC1_4_OPAQUE_ONLY: return basisu::cPVRTC1_4_RGB;
-			case cTFBC7_M6_OPAQUE_ONLY: return basisu::cBC7;
-			case cTFETC2: return basisu::cETC2_RGBA;
-			case cTFBC3: return basisu::cBC3;
-			case cTFBC5: return basisu::cBC5;
-			default:
-				assert(0);
-				BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n");
-				break;
+		case cTFETC1: return basisu::cETC1;
+		case cTFBC1: return basisu::cBC1;
+		case cTFBC4: return basisu::cBC4;
+		case cTFPVRTC1_4_OPAQUE_ONLY: return basisu::cPVRTC1_4_RGB;
+		case cTFBC7_M6_OPAQUE_ONLY: return basisu::cBC7;
+		case cTFETC2: return basisu::cETC2_RGBA;
+		case cTFBC3: return basisu::cBC3;
+		case cTFBC5: return basisu::cBC5;
+		default:
+			assert(0);
+			BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n");
+			break;
 		}
 		return basisu::cInvalidTextureFormat;
 	}
diff --git a/transcoder/basisu_transcoder.h b/transcoder/basisu_transcoder.h
index c70845a..bd5807d 100644
--- a/transcoder/basisu_transcoder.h
+++ b/transcoder/basisu_transcoder.h
@@ -59,6 +59,22 @@
 	const char *basis_get_texture_type_name(basis_texture_type tex_type);
 	
 	class basisu_transcoder;
+
+	// This struct holds all state used during transcoding. For video, it needs to persist between image transcodes (it holds the previous frame).
+	// For threading you can use one state per thread.
+	struct basisu_transcoder_state
+	{
+		struct block_preds
+		{
+			uint16_t m_endpoint_index;
+			uint8_t m_pred_bits;
+		};
+
+		std::vector<block_preds> m_block_endpoint_preds[2];
+		
+		enum { cMaxPrevFrameLevels = 16 };
+		std::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] 
+	};
 	
 	class basisu_lowlevel_transcoder
 	{
@@ -74,7 +90,7 @@
 		bool decode_tables(const uint8_t *pTable_data, uint32_t table_data_size);
 
 		bool transcode_slice(void *pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t *pImage_data, uint32_t image_data_size, block_format fmt, 
-			uint32_t output_stride, bool wrap_addressing, bool bc1_allow_threecolor_blocks, uint32_t output_row_pitch_in_blocks = 0);
+			uint32_t output_stride, bool wrap_addressing, bool bc1_allow_threecolor_blocks, const basis_file_header &header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks = 0, basisu_transcoder_state *pState = nullptr);
 
 	private:
 		struct endpoint
@@ -94,14 +110,8 @@
 		huffman_decoding_table m_endpoint_pred_model, m_delta_endpoint_model, m_selector_model, m_selector_history_buf_rle_model;
 
 		uint32_t m_selector_history_buf_size;
-
-		struct block_preds
-		{
-			uint16_t m_endpoint_index;
-			uint8_t m_pred_bits;
-		};
-
-		std::vector<block_preds> m_block_endpoint_preds[2];
+		
+		basisu_transcoder_state m_def_state;
 	};
 
 	struct basisu_slice_info
@@ -125,6 +135,7 @@
 		uint32_t m_unpacked_slice_crc16;
 		
 		bool m_alpha_flag;		// true if the slice has alpha data
+		bool m_iframe_flag;		// true if the slice is an I-Frame
 	};
 
 	typedef std::vector<basisu_slice_info> basisu_slice_info_vec;
@@ -147,6 +158,7 @@
 		uint32_t m_first_slice_index;	
 								
 		bool m_alpha_flag;		// true if the image has alpha data
+		bool m_iframe_flag;		// true if the image is an I-Frame
 	};
 
 	struct basisu_image_level_info
@@ -167,6 +179,7 @@
 		uint32_t m_first_slice_index;	
 								
 		bool m_alpha_flag;		// true if the image has alpha data
+		bool m_iframe_flag;		// true if the image is an I-Frame
 	};
 
 	struct basisu_file_info
@@ -270,7 +283,7 @@
 			uint32_t image_index, uint32_t level_index, 
 			void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks,
 			transcoder_texture_format fmt,
-			uint32_t decode_flags = cDecodeFlagsPVRTCWrapAddressing, uint32_t output_row_pitch_in_blocks = 0) const;
+			uint32_t decode_flags = cDecodeFlagsPVRTCWrapAddressing, uint32_t output_row_pitch_in_blocks = 0, basisu_transcoder_state *pState = nullptr) const;
 
 		// Finds the basis slice corresponding to the specified image/level/alpha params, or -1 if the slice can't be found.
 		int find_slice(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const;
@@ -284,12 +297,9 @@
 		// output_row_pitch_in_blocks: Number of blocks per row. If 0, the transcoder uses the slice's num_blocks_x. Ignored for PVRTC1 (due to texture swizzling).
 		bool transcode_slice(const void *pData, uint32_t data_size, uint32_t slice_index, 
 			void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks, 
-			block_format fmt, uint32_t output_block_stride_in_bytes, uint32_t decode_flags = cDecodeFlagsPVRTCWrapAddressing, uint32_t output_row_pitch_in_blocks = 0) const;
+			block_format fmt, uint32_t output_block_stride_in_bytes, uint32_t decode_flags = cDecodeFlagsPVRTCWrapAddressing, uint32_t output_row_pitch_in_blocks = 0, basisu_transcoder_state * pState = nullptr) const;
 
 	private:
-		const void *m_pFile_data;
-		uint32_t m_file_data_size;
-
 		mutable basisu_lowlevel_transcoder m_lowlevel_decoder;
 
 		int find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const;
@@ -300,4 +310,12 @@
 	// basisu_transcoder_init() must be called before a .basis file can be transcoded.
 	void basisu_transcoder_init();
 
+	enum debug_flags_t // Debug flags; each enumerator is a distinct bit, so values can be OR-ed into one mask.
+	{
+		cDebugFlagVisCRs = 1,
+		cDebugFlagVisBC1Sels = 2,
+		cDebugFlagVisBC1Endpoints = 4
+	};
+	uint32_t get_debug_flags(); // NOTE(review): presumably returns the current debug_flags_t mask - confirm against the definition.
+	void set_debug_flags(uint32_t f); // NOTE(review): presumably f is an OR of debug_flags_t values - confirm against the definition.
 } // namespace basisu
diff --git a/transcoder/basisu_transcoder_internal.h b/transcoder/basisu_transcoder_internal.h
index d4d210e..cf1c3d4 100644
--- a/transcoder/basisu_transcoder_internal.h
+++ b/transcoder/basisu_transcoder_internal.h
@@ -33,6 +33,11 @@
 
 #define BASISD_znew (z = 36969 * (z & 65535) + (z >> 16))
 
+namespace basisu
+{
+	extern bool g_debug_printf;
+}
+
 namespace basist
 {
 	const int COLOR5_PAL0_PREV_HI = 9, COLOR5_PAL0_DELTA_LO = -9, COLOR5_PAL0_DELTA_HI = 31;
@@ -45,6 +50,9 @@
 	const uint32_t ENDPOINT_PRED_MIN_REPEAT_COUNT = 3;
 	const uint32_t ENDPOINT_PRED_COUNT_VLC_BITS = 4;
 
+	const uint32_t NUM_ENDPOINT_PREDS = 3; // Literal stand-in for BASISU_ARRAY_SIZE(g_endpoint_preds); keep the two in sync.
+	const uint32_t CR_ENDPOINT_PRED_INDEX = NUM_ENDPOINT_PREDS - 1; // Last predictor index. NOTE(review): "CR" presumably refers to the skip-block predictor - confirm.
+	const uint32_t NO_ENDPOINT_PRED_INDEX = 3; // Same value as NUM_ENDPOINT_PREDS, i.e. one past the last predictor index.
 	const uint32_t MAX_SELECTOR_HISTORY_BUF_SIZE = 64;
 	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH = 3;
 	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_BITS = 6;
diff --git a/webgl/transcoder/basis_wrappers.cpp b/webgl/transcoder/basis_wrappers.cpp
index 408cbd7..d389d5a 100644
--- a/webgl/transcoder/basis_wrappers.cpp
+++ b/webgl/transcoder/basis_wrappers.cpp
@@ -1,6 +1,7 @@
 // basis_wrappers.cpp - Simple C-style wrappers to the C++ transcoder for WebGL use.
 #include "basisu_transcoder.h"
 #include <emscripten/bind.h>
+#include <algorithm>
 
 using namespace emscripten;
 using namespace basist;
@@ -118,6 +119,16 @@
     if (!m_transcoder.get_image_level_desc(m_file.data(), m_file.size(), image_index, level_index, orig_width, orig_height, total_blocks))
       return 0;
 
+    if (format == cTFPVRTC1_4_OPAQUE_ONLY)
+    {
+	    // For PVRTC1, Basis only writes (or requires) total_blocks * bytes_per_block. But GL requires extra padding for very small textures: 
+        // https://www.khronos.org/registry/OpenGL/extensions/IMG/IMG_texture_compression_pvrtc.txt
+        const uint32_t width = (orig_width + 3) & ~3;
+        const uint32_t height = (orig_height + 3) & ~3;
+        const uint32_t size_in_bytes = (std::max(8U, width) * std::max(8U, height) * 4 + 7) / 8;
+        return size_in_bytes;
+    }
+
     return total_blocks * bytes_per_block;
   }
 
@@ -145,6 +156,17 @@
 
     uint32_t required_size = total_blocks * bytes_per_block;
 
+    if (format == cTFPVRTC1_4_OPAQUE_ONLY)
+    {
+		// For PVRTC1, Basis only writes (or requires) total_blocks * bytes_per_block. But GL requires extra padding for very small textures: 
+		// https://www.khronos.org/registry/OpenGL/extensions/IMG/IMG_texture_compression_pvrtc.txt
+		// The transcoder will clear the extra bytes following the used blocks to 0.
+        const uint32_t width = (orig_width + 3) & ~3;
+        const uint32_t height = (orig_height + 3) & ~3;
+        required_size = (std::max(8U, width) * std::max(8U, height) * 4 + 7) / 8;
+        assert(required_size >= total_blocks * bytes_per_block);
+    }
+
     std::vector<uint8_t> dst_data(required_size);
 
     uint32_t status = m_transcoder.transcode_image_level(