Basis Universal v1.13:
- Optimized ETC1S encoder (3-4.5x faster)
- Added optional SSE 4.1 support to encoder
- Switched from std::vector to a custom vector in the encoder and transcoder
- Added CppSPMD (SSE only for now)
- UASTC RDO is now more effective, but the command line parameter controlling quality has changed (to "lambda")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f632e3..b7ba452 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@
 cmake_minimum_required(VERSION 3.0)
 option(BUILD_X64 "build 64-bit" TRUE)
 option(STATIC "static linking" FALSE)
+option(SSE "SSE 4.1 support" FALSE)
 
 message("Initial BUILD_X64=${BUILD_X64}")
 message("Initial CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
@@ -19,6 +20,12 @@
 	message("Building 32-bit")
 endif()
 
+if (SSE)
+	message("SSE enabled")
+else()
+	message("SSE disabled")
+endif()
+
 if (NOT MSVC)
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
@@ -34,13 +41,29 @@
    endif()
 
    if (EMSCRIPTEN)
-	  set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -s ALLOW_MEMORY_GROWTH=1")
-	  set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -s ALLOW_MEMORY_GROWTH=1")
+	  set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -s ALLOW_MEMORY_GROWTH=1 -DBASISU_SUPPORT_SSE=0")
+	  set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -s ALLOW_MEMORY_GROWTH=1 -DBASISU_SUPPORT_SSE=0")
 
 	  set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${GCC_LINK_FLAGS}")
    elseif (STATIC)
+      if (SSE)
+		set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DBASISU_SUPPORT_SSE=1 -msse4.1")
+		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DBASISU_SUPPORT_SSE=1 -msse4.1")
+	  else()
+	  	set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DBASISU_SUPPORT_SSE=0")
+		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DBASISU_SUPPORT_SSE=0")
+	  endif()
+	  
 	  set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${GCC_LINK_FLAGS} -static-libgcc -static-libstdc++ -static")
    else()
+   	  if (SSE)
+		set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DBASISU_SUPPORT_SSE=1 -msse4.1")
+		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DBASISU_SUPPORT_SSE=1 -msse4.1")
+	  else()
+	  	set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DBASISU_SUPPORT_SSE=0")
+		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DBASISU_SUPPORT_SSE=0")
+	  endif()
+	  
 	  set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${GCC_LINK_FLAGS} -Wl,-rpath .")
    endif()
 
@@ -51,6 +74,14 @@
    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${GCC_COMPILE_FLAGS}")
    set(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
+else()
+	if (SSE)
+		set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DBASISU_SUPPORT_SSE=1")
+		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DBASISU_SUPPORT_SSE=1")
+	else()
+		set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -DBASISU_SUPPORT_SSE=0")
+		set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -DBASISU_SUPPORT_SSE=0")
+	endif()
 endif()
 
 set(BASISU_SRC_LIST ${COMMON_SRC_LIST} 
@@ -73,6 +104,7 @@
 	encoder/lodepng.cpp
 	encoder/apg_bmp.c
 	encoder/jpgd.cpp
+	encoder/basisu_kernels_sse.cpp
 	transcoder/basisu_transcoder.cpp
 	)
 
diff --git a/README.md b/README.md
index 7579a79..8d5a0c1 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,8 @@
 So far, we've compiled the code using MSVS 2019, under Ubuntu x64 using cmake with either clang 3.8 or gcc 5.4, and emscripten 1.35 to asm.js. (Be sure to use this version or later of emcc, as earlier versions fail with internal errors/exceptions during compilation.) The compressor is multithreaded by default, but this can be disabled using the -no_multithreading command line option. The transcoder is currently single threaded.
 
 Basis Universal supports "skip blocks" in ETC1S compressed texture arrays, which makes it useful for basic [compressed texture video](http://gamma.cs.unc.edu/MPTC/) applications. Note that Basis Universal is still at heart a GPU texture compression system, not a dedicated video codec, so bitrates will be larger than even MPEG1.
+1/10/21 release notes:
+- We've added numerous ETC1S encoder optimizations designed to greatly speed up single threaded encoding time, as well as greatly reducing overall CPU utilization when multithreading is enabled. For benchmarking, we're using "-q 128 -no_multithreading -mip_fast". The encoder now uses approximately 1/3rd as much total CPU time for the same basic PSNR. The encoder can now optionally utilize SSE 4.1 - see the "-no_sse" command line option.
 
 [Release Notes](https://github.com/BinomialLLC/basis_universal/wiki/Release-Notes)
 
@@ -73,12 +75,17 @@
 
 The command line tool used to create, validate, and transcode/unpack .basis files is named "basisu". Run basisu without any parameters for help. 
 
-To build basisu:
+To build basisu (without SSE 4.1 support - the default):
 
 ```
 cmake CMakeLists.txt
 make
 ```
+To build with SSE 4.1 support on x86/x64 systems (encoding is roughly 15-30% faster):
+```
+cmake -D SSE=TRUE CMakeLists.txt
+make
+```
 
 For Visual Studio 2019, you can now either use the CMakeLists.txt file or the included `basisu.sln` file.
 
@@ -138,9 +145,10 @@
 
 To compress small video sequences, say using tools like ffmpeg and VirtualDub:
 
-`basisu -comp_level 1 -tex_type video -stats -debug -multifile_printf "pic%04u.png" -multifile_num 200 -multifile_first 1 -max_selectors 16128 -max_endpoints 16128 -endpoint_rdo_thresh 1.05 -selector_rdo_thresh 1.05`
+`basisu -comp_level 2 -tex_type video -stats -debug -multifile_printf "pic%04u.png" -multifile_num 200 -multifile_first 1 -max_selectors 16128 -max_endpoints 16128 -endpoint_rdo_thresh 1.05 -selector_rdo_thresh 1.05`
 
 The reference encoder will take a LONG time and a lot of CPU to encode video. The more cores your machine has, the better. Basis is intended for smaller videos of a few dozen seconds or so. If you are very patient and have a Threadripper or Xeon workstation, you should be able to encode up to a few thousand 720P frames. The "webgl_videotest" directory contains a very simple video viewer.
+For texture video, use -comp_level 2 or 3. The default is 1, which isn't quite good enough for texture video.
 
 The .basis file will contain multiple images (all using the same global codebooks), which you can retrieve using the transcoder's image API. The system now supports [conditional replenisment](https://en.wikipedia.org/wiki/MPEG-1) (CR, or "skip blocks"). CR can reduce the bitrate of some videos (highly dependent on how dynamic the content is) by over 50%. For videos using CR, the images must be requested from the transcoder in sequence from first to last, and random access is only allowed to I-Frames. 
 
@@ -182,7 +190,7 @@
 `basisu -linear -global_sel_pal -file x.png`\
 Compress a non-sRGB image, use hybrid selector codebooks for slightly improved compression (but slower encoding)
 
-`basisu -tex_type video -framerate 20 -multifile_printf "x%02u.png" -multifile_first 1 -multifile_count 20 -selector_rdo_thresh 1.05 -endpoint_rdo_thresh 1.05`\
+`basisu -tex_type video -comp_level 2 -framerate 20 -multifile_printf "x%02u.png" -multifile_first 1 -multifile_count 20 -selector_rdo_thresh 1.05 -endpoint_rdo_thresh 1.05`\
 Compress a 20 sRGB source image video sequence (x01.png, x02.png, x03.png, etc.) to x01.basis
 
 `basisu -comp_level 2 -q 255 -file x.png -mipmap -y_flip`\
diff --git a/basisu_tool.cpp b/basisu_tool.cpp
index aa168dc..2db9230 100644
--- a/basisu_tool.cpp
+++ b/basisu_tool.cpp
@@ -1,5 +1,5 @@
 // basisu_tool.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -28,12 +28,15 @@
 #include "transcoder/basisu_transcoder.h"
 #include "encoder/basisu_ssim.h"
 
+#define MINIZ_HEADER_FILE_ONLY
+#include "encoder/basisu_miniz.h"
+
 // Set BASISU_CATCH_EXCEPTIONS if you want exceptions to crash the app, otherwise main() catches them.
 #define BASISU_CATCH_EXCEPTIONS 0
 
 using namespace basisu;
 
-#define BASISU_TOOL_VERSION "1.12.00"
+#define BASISU_TOOL_VERSION "1.13"
 
 enum tool_mode
 {
@@ -44,7 +47,8 @@
 	cUnpack,
 	cCompare,
 	cVersion,
-	cBench
+	cBench,
+	cCompSize
 };
 
 static void print_usage()
@@ -83,13 +87,19 @@
 		" For video, the .basis file will be written with the first frame being an I-Frame, and subsequent frames being P-Frames (using conditional replenishment). Playback must always occur in order from first to last image.\n"
 		" -framerate X: Set framerate in header to X/frames sec.\n"
 		" -individual: Process input images individually and output multiple .basis files (not as a texture array)\n"
-		" -comp_level X: Set ETC1S encoding speed vs. quality tradeoff. Range is 0-5, default is 1. Higher values=MUCH slower, but slightly higher quality. Mostly intended for videos. Use -q first!\n"
+		" -comp_level X: Set ETC1S encoding speed vs. quality tradeoff. Range is 0-6, default is 1. Higher values=MUCH slower, but slightly higher quality. Higher levels intended for videos. Use -q first!\n"
 		" -fuzz_testing: Use with -validate: Disables CRC16 validation of file contents before transcoding\n"
 		"\nUASTC options:\n"
-		" -uastc: Enable UASTC texture mode, instead of the default ETC1S mode. Significantly higher texture quality, but larger files. (Note that .basis files must be losslessly compressed by the user.)\n"
+		" -uastc: Enable UASTC texture mode, instead of the default ETC1S mode. Significantly higher texture quality, but larger files. (Note that UASTC .basis files must be losslessly compressed by the user.)\n"
 		" -uastc_level: Set UASTC encoding level. Range is [0,4], default is 2, higher=slower but higher quality. 0=fastest/lowest quality, 3=slowest practical option, 4=impractically slow/highest achievable quality\n"
-		" -uastc_rdo_q X: Enable UASTC RDO post-processing and set UASTC RDO quality scalar to X. Lower values=higher quality/larger LZ compressed files, higher values=lower quality/smaller LZ compressed files. Good range to try is [.2-4].\n"
-		" -uastc_rdo_d X: Set UASTC RDO dictionary size in bytes. Default is 32768. Lower values=faster, but less compression.\n"
+		" -uastc_rdo_l X: Enable UASTC RDO post-processing and set UASTC RDO quality scalar (lambda) to X. Lower values=higher quality/larger LZ\n"
+		"                 compressed files, higher values=lower quality/smaller LZ compressed files. Good range to try is [.25-10].\n"
+		"                 Note: Previous versions used the -uastc_rdo_q option, which was removed because the RDO algorithm was changed.\n"
+		" -uastc_rdo_d X: Set UASTC RDO dictionary size in bytes. Default is 4096, max is 65536. Lower values=faster, but less compression.\n"
+		" -uastc_rdo_b X: Set UASTC RDO max smooth block error scale. Range is [1,300]. Default is 10.0, 1.0=disabled. Larger values suppress more artifacts (and allocate more bits) on smooth blocks.\n"
+		" -uastc_rdo_s X: Set UASTC RDO max smooth block standard deviation. Range is [.01,65536]. Default is 18.0. Larger values expand the range of blocks considered smooth.\n"
+		" -uastc_rdo_f: Don't favor simpler UASTC modes in RDO mode.\n"
+		" -uastc_rdo_m: Disable RDO multithreading (slightly higher compression, deterministic).\n"
 		"\n"
 		"More options:\n"
 		" -max_endpoints X: Manually set the max number of color endpoint clusters from 1-16128, use instead of -q\n"
@@ -107,6 +117,8 @@
 		" -disable_hierarchical_endpoint_codebooks: Disable hierarchical endpoint codebook usage, slower but higher quality on some compression levels\n"
 		" -compare_ssim: Compute and display SSIM of image comparison (slow)\n"
 		" -bench: UASTC benchmark mode, for development only\n"
+		" -resample_factor X: Resample all input textures by scale factor X using a box filter\n"
+		" -no_sse: Forbid all SSE instruction set usage\n"
 		"\n"
 		"Mipmap generation options:\n"
 		" -mipmap: Generate mipmaps for each source image\n"
@@ -116,6 +128,8 @@
 		" -mip_filter X: Set mipmap filter kernel, default is kaiser, filters: box, tent, bell, blackman, catmullrom, mitchell, etc.\n"
 		" -mip_renorm: Renormalize normal map to unit length vectors after filtering\n"
 		" -mip_clamp: Use clamp addressing on borders, instead of wrapping\n"
+		" -mip_fast: Use faster mipmap generation (resample from previous mip, not always first/largest mip level). The default (as of 1/2021)\n"
+		" -mip_slow: Always resample each mipmap level starting from the largest mipmap. Higher quality, but slower. Opposite of -mip_fast. Was the prior default before 1/2021.\n"
 		" -mip_smallest X: Set smallest pixel dimension for generated mipmaps, default is 1 pixel\n"
 		"By default, textures will be converted from sRGB to linear light before mipmap filtering, then back to sRGB (for the RGB color channels) unless -linear is specified.\n"
 		"You can override this behavior with -mip_srgb/-mip_linear.\n"
@@ -126,44 +140,37 @@
 		" -no_endpoint_rdo: Disable backend's endpoint rate distortion optimizations (slightly faster, less noisy output, but lower quality per output bit)\n"
 		" -endpoint_rdo_thresh X: Set endpoint RDO quality threshold, default is 1.5, lower is higher quality but less quality per output bit (try 1.0-3.0)\n"
 		"\n"
-		"Hierarchical virtual selector codebook options:\n"
-		" -global_sel_pal: Always use vitual selector palettes (instead of custom palettes), slightly smaller files, but lower quality, slower encoding\n"
-		" -auto_global_sel_pal: Automatically use virtual selector palettes on small images for slightly smaller files (defaults to off for faster encoding time)\n"
-		" -no_hybrid_sel_cb: Don't automatically use hybrid virtual selector codebooks (for higher quality, only active when -global_sel_pal is specified)\n"
-		" -global_pal_bits X: Set virtual selector codebook palette bits, range is [0,12], default is 8, higher is slower/better quality\n"
-		" -global_mod_bits X: Set virtual selector codebook modifier bits, range is [0,15], defualt is 8, higher is slower/better quality\n"
-		" -hybrid_sel_cb_quality_thresh X: Set hybrid selector codebook quality threshold, default is 2.0, try 1.5-3, higher is lower quality/smaller codebooks\n"
-		"\n"
 		"Set various fields in the Basis file header:\n"
 		" -userdata0 X: Set 32-bit userdata0 field in Basis file header to X (X is a signed 32-bit int)\n"
 		" -userdata1 X: Set 32-bit userdata1 field in Basis file header to X (X is a signed 32-bit int)\n"
 		"\n"
 		"Various command line examples:\n"
-		" basisu x.png : Compress sRGB image x.png to x.basis using default settings (multiple filenames OK)\n"
+		" basisu x.png : Compress sRGB image x.png to x.basis using default settings (multiple filenames OK, use -individual if you don't want a tex array)\n"
 		" basisu x.basis : Unpack x.basis to PNG/KTX files (multiple filenames OK)\n"
 		" basisu -file x.png -mipmap -y_flip : Compress a mipmapped x.basis file from an sRGB image named x.png, Y flip each source image\n"
 		" basisu -validate -file x.basis : Validate x.basis (check header, check file CRC's, attempt to transcode all slices)\n"
 		" basisu -unpack -file x.basis : Validates, transcodes and unpacks x.basis to mipmapped .KTX and RGB/A .PNG files (transcodes to all supported GPU texture formats)\n"
 		" basisu -q 255 -file x.png -mipmap -debug -stats : Compress sRGB x.png to x.basis at quality level 255 with compressor debug output/statistics\n"
 		" basisu -linear -max_endpoints 16128 -max_selectors 16128 -file x.png : Compress non-sRGB x.png to x.basis using the largest supported manually specified codebook sizes\n"
-		" basisu -linear -global_sel_pal -no_hybrid_sel_cb -file x.png : Compress a non-sRGB image, use virtual selector codebooks for improved compression (but slower encoding)\n"
-		" basisu -linear -global_sel_pal -file x.png: Compress a non-sRGB image, use hybrid selector codebooks for slightly improved compression (but slower encoding)\n"
-		" basisu -tex_type video -framerate 20 -multifile_printf \"x%02u.png\" -multifile_first 1 -multifile_count 20 : Compress a 20 sRGB source image video sequence (x01.png, x02.png, x03.png, etc.) to x01.basis\n"
+		" basisu -comp_level 2 -max_selectors 8192 -max_endpoints 8192 -tex_type video -framerate 20 -multifile_printf \"x%02u.png\" -multifile_first 1 -multifile_count 20 : Compress a 20 sRGB source image video sequence (x01.png, x02.png, x03.png, etc.) to x01.basis\n"
 		"\n"
-		"Note: For video use, it's recommended you use a very powerful machine with many cores. Use -slower for better codebook generation, specify very large codebooks using -max_endpoints and -max_selectors, and reduce\n"
-		"the default endpoint RDO threshold (-endpoint_rdo_thresh) to around 1.25. Videos may have mipmaps and alpha channels. Videos must always be played back by the transcoder in first to last image order.\n"
+		"Note: For video use, it's recommended you use a very powerful machine with many cores. Use -comp_level 2 or higher for better codebook\n"
+		"generation, specify very large codebooks using -max_endpoints and -max_selectors, and reduce the default endpoint RDO threshold\n"
+		"(-endpoint_rdo_thresh) to around 1.25. Videos may have mipmaps and alpha channels. Videos must always be played back by the transcoder\n"
+		"in first to last image order.\n"
 		"Video files currently use I-Frames on the first image, and P-Frames using conditional replenishment on subsequent frames.\n"
-		"Compression level details:\n"
-		" Level 0: Fastest, but has marginal quality and is a work in progress. Brittle on complex images. Avg. Y dB: 35.45\n"
-		" Level 1: Hierarchical codebook searching. 36.87 dB, ~1.4x slower vs. level 0. (This is the default setting.)\n"
-		" Level 2: Full codebook searching. 37.13 dB, ~1.8x slower vs. level 0. (Equivalent the the initial release's default settings.)\n"
-		" Level 3: Hierarchical codebook searching, codebook k-means iterations. 37.15 dB, ~4x slower vs. level 0\n"
-		" Level 4: Full codebook searching, codebook k-means iterations. 37.41 dB, ~5.5x slower vs. level 0. (Equivalent to the initial release's -slower setting.)\n"
-		" Level 5: Full codebook searching, twice as many codebook k-means iterations, best ETC1 endpoint opt. 37.43 dB, ~12x slower vs. level 0\n"
+		"\nCompression level (-comp_level X) details:\n"
+		" Level 0: Fastest, but has marginal quality and can be brittle on complex images. Avg. Y dB: 35.45\n"
+		" Level 1: Hierarchical codebook searching, faster ETC1S encoding. 36.87 dB, ~1.4x slower vs. level 0. (This is the default setting.)\n"
+		" Level 2: Use this or higher for video. Hierarchical codebook searching. 36.87 dB, ~1.4x slower vs. level 0. (This was v1.12's default setting.)\n"
+		" Level 3: Full codebook searching. 37.13 dB, ~1.8x slower vs. level 0. (Equivalent to the initial release's default settings.)\n"
+		" Level 4: Hierarchical codebook searching, codebook k-means iterations. 37.15 dB, ~4x slower vs. level 0\n"
+		" Level 5: Full codebook searching, codebook k-means iterations. 37.41 dB, ~5.5x slower vs. level 0. (Equivalent to the initial release's -slower setting.)\n"
+		" Level 6: Full codebook searching, twice as many codebook k-means iterations, best ETC1 endpoint opt. 37.43 dB, ~12x slower vs. level 0\n"
 	);
 }
 
-static bool load_listing_file(const std::string &f, std::vector<std::string> &filenames)
+static bool load_listing_file(const std::string &f, basisu::vector<std::string> &filenames)
 {
 	std::string filename(f);
 	filename.erase(0, 1);
@@ -250,6 +257,7 @@
 		m_compare_ssim(false),
 		m_bench(false)
 	{
+		m_comp_params.m_compression_level = std::max<int>(0, BASISU_DEFAULT_COMPRESSION_LEVEL - 1);
 	}
 
 	bool parse(int arg_c, const char **arg_v)
@@ -279,6 +287,14 @@
 				m_compare_ssim = true;
 			else if (strcasecmp(pArg, "-bench") == 0)
 				m_mode = cBench;
+			else if (strcasecmp(pArg, "-comp_size") == 0)
+				m_mode = cCompSize;
+			else if (strcasecmp(pArg, "-no_sse") == 0)
+			{
+#if BASISU_SUPPORT_SSE
+				g_cpu_supports_sse41 = false;
+#endif
+			}
 			else if (strcasecmp(pArg, "-file") == 0)
 			{
 				REMAINING_ARGS_CHECK(1);
@@ -327,7 +343,13 @@
 				
 				arg_count++;
 			}
-			else if (strcasecmp(pArg, "-uastc_rdo_q") == 0)
+			else if (strcasecmp(pArg, "-resample_factor") == 0)
+			{
+				REMAINING_ARGS_CHECK(1);
+				m_comp_params.m_resample_factor = (float)atof(arg_v[arg_index + 1]);
+				arg_count++;
+			}
+			else if (strcasecmp(pArg, "-uastc_rdo_l") == 0)
 			{
 				REMAINING_ARGS_CHECK(1);
 				m_comp_params.m_rdo_uastc_quality_scalar = (float)atof(arg_v[arg_index + 1]);
@@ -340,6 +362,22 @@
 				m_comp_params.m_rdo_uastc_dict_size = atoi(arg_v[arg_index + 1]);
 				arg_count++;
 			}
+			else if (strcasecmp(pArg, "-uastc_rdo_b") == 0)
+			{
+				REMAINING_ARGS_CHECK(1);
+				m_comp_params.m_rdo_uastc_max_smooth_block_error_scale = (float)atof(arg_v[arg_index + 1]);
+				arg_count++;
+			}
+			else if (strcasecmp(pArg, "-uastc_rdo_s") == 0)
+			{
+				REMAINING_ARGS_CHECK(1);
+				m_comp_params.m_rdo_uastc_smooth_block_max_std_dev = (float)atof(arg_v[arg_index + 1]);
+				arg_count++;
+			}
+			else if (strcasecmp(pArg, "-uastc_rdo_f") == 0)
+				m_comp_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode = false;
+			else if (strcasecmp(pArg, "-uastc_rdo_m") == 0)
+				m_comp_params.m_rdo_uastc_multithreading = false;
 			else if (strcasecmp(pArg, "-linear") == 0)
 				m_comp_params.m_perceptual = false;
 			else if (strcasecmp(pArg, "-srgb") == 0)
@@ -380,7 +418,7 @@
 			else if (strcasecmp(pArg, "-slower") == 0)
 			{
 				// This option is gone, but we'll do something reasonable with it anyway. Level 4 is equivalent to the original release's -slower, but let's just go to level 2.
-				m_comp_params.m_compression_level = 2;
+				m_comp_params.m_compression_level = BASISU_DEFAULT_COMPRESSION_LEVEL + 1;
 			}
 			else if (strcasecmp(pArg, "-max_endpoints") == 0)
 			{
@@ -473,6 +511,10 @@
 				m_comp_params.m_mip_renormalize = true;
 			else if (strcasecmp(pArg, "-mip_clamp") == 0)
 				m_comp_params.m_mip_wrapping = false;
+			else if (strcasecmp(pArg, "-mip_fast") == 0)
+				m_comp_params.m_mip_fast = true;
+			else if (strcasecmp(pArg, "-mip_slow") == 0)
+				m_comp_params.m_mip_fast = false;
 			else if (strcasecmp(pArg, "-mip_smallest") == 0)
 			{
 				REMAINING_ARGS_CHECK(1);
@@ -623,7 +665,7 @@
 
 	bool process_listing_files()
 	{
-		std::vector<std::string> new_input_filenames;
+		basisu::vector<std::string> new_input_filenames;
 		for (uint32_t i = 0; i < m_input_filenames.size(); i++)
 		{
 			if (m_input_filenames[i][0] == '@')
@@ -636,7 +678,7 @@
 		}
 		new_input_filenames.swap(m_input_filenames);
 
-		std::vector<std::string> new_input_alpha_filenames;
+		basisu::vector<std::string> new_input_alpha_filenames;
 		for (uint32_t i = 0; i < m_input_alpha_filenames.size(); i++)
 		{
 			if (m_input_alpha_filenames[i][0] == '@')
@@ -656,8 +698,8 @@
 		
 	tool_mode m_mode;
 		
-	std::vector<std::string> m_input_filenames;
-	std::vector<std::string> m_input_alpha_filenames;
+	basisu::vector<std::string> m_input_filenames;
+	basisu::vector<std::string> m_input_alpha_filenames;
 
 	std::string m_output_filename;
 	std::string m_output_path;
@@ -751,17 +793,23 @@
 	FILE *pCSV_file = nullptr;
 	if (opts.m_csv_file.size())
 	{
-		pCSV_file = fopen_safe(opts.m_csv_file.c_str(), "a");
+		//pCSV_file = fopen_safe(opts.m_csv_file.c_str(), "a");
+		pCSV_file = fopen_safe(opts.m_csv_file.c_str(), "w");
 		if (!pCSV_file)
 		{
 			error_printf("Failed opening CVS file \"%s\"\n", opts.m_csv_file.c_str());
 			return false;
 		}
+		fprintf(pCSV_file, "Filename, Size, Slices, Width, Height, HasAlpha, BitsPerTexel, Slice0RGBAvgPSNR, Slice0RGBAAvgPSNR, Slice0Luma709PSNR, Slice0BestETC1SLuma709PSNR, Q, CL, Time, RGBAvgPSNRMin, RGBAvgPSNRAvg, AAvgPSNRMin, AAvgPSNRAvg, Luma709PSNRMin, Luma709PSNRAvg\n");
 	}
 
 	printf("Processing %u total file(s)\n", (uint32_t)opts.m_input_filenames.size());
 				
-	for (size_t file_index = 0; file_index < (opts.m_individual ? opts.m_input_filenames.size() : 1U); file_index++)
+	interval_timer all_tm;
+	all_tm.start();
+
+	const size_t total_files = (opts.m_individual ? opts.m_input_filenames.size() : 1U);
+	for (size_t file_index = 0; file_index < total_files; file_index++)
 	{
 		if (opts.m_individual)
 		{
@@ -828,7 +876,7 @@
 
 		if (ec == basis_compressor::cECSuccess)
 		{
-			printf("Compression succeeded to file \"%s\" in %3.3f secs\n", params.m_out_filename.c_str(), tm.get_elapsed_secs());
+			printf("Compression succeeded to file \"%s\" size %i bytes in %3.3f secs\n", params.m_out_filename.c_str(), (int)c.get_output_basis_file().size(), tm.get_elapsed_secs());
 		}
 		else
 		{
@@ -888,6 +936,7 @@
 
 		if ((pCSV_file) && (c.get_stats().size()))
 		{
+#if 0
 			for (size_t slice_index = 0; slice_index < c.get_stats().size(); slice_index++)
 			{
 				fprintf(pCSV_file, "\"%s\", %u, %u, %u, %u, %u, %f, %f, %f, %f, %f, %u, %u, %f\n",
@@ -902,6 +951,46 @@
 					params.m_quality_level, (int)params.m_compression_level, tm.get_elapsed_secs());
 				fflush(pCSV_file);
 			}
+#else
+			if (c.get_stats().size())
+			{
+				float rgb_avg_psnr_min = 1e+9f, rgb_avg_psnr_avg = 0.0f;
+				float a_avg_psnr_min = 1e+9f, a_avg_psnr_avg = 0.0f;
+				float luma_709_psnr_min = 1e+9f, luma_709_psnr_avg = 0.0f;
+
+				for (size_t slice_index = 0; slice_index < c.get_stats().size(); slice_index++)
+				{
+					rgb_avg_psnr_min = std::min(rgb_avg_psnr_min, c.get_stats()[slice_index].m_basis_rgb_avg_psnr);
+					rgb_avg_psnr_avg += c.get_stats()[slice_index].m_basis_rgb_avg_psnr;
+
+					a_avg_psnr_min = std::min(a_avg_psnr_min, c.get_stats()[slice_index].m_basis_a_avg_psnr);
+					a_avg_psnr_avg += c.get_stats()[slice_index].m_basis_a_avg_psnr;
+
+					luma_709_psnr_min = std::min(luma_709_psnr_min, c.get_stats()[slice_index].m_basis_luma_709_psnr);
+					luma_709_psnr_avg += c.get_stats()[slice_index].m_basis_luma_709_psnr;
+				}
+
+				rgb_avg_psnr_avg /= c.get_stats().size();
+				a_avg_psnr_avg /= c.get_stats().size();
+				luma_709_psnr_avg /= c.get_stats().size();
+				
+				fprintf(pCSV_file, "\"%s\", %u, %u, %u, %u, %u, %f, %f, %f, %f, %f, %u, %u, %f, %f, %f, %f, %f, %f, %f\n",
+					params.m_out_filename.c_str(),
+					c.get_basis_file_size(),
+					(uint32_t)c.get_stats().size(),
+					c.get_stats()[0].m_width, c.get_stats()[0].m_height, (uint32_t)c.get_any_source_image_has_alpha(),
+					c.get_basis_bits_per_texel(),
+					c.get_stats()[0].m_basis_rgb_avg_psnr,
+					c.get_stats()[0].m_basis_rgba_avg_psnr,
+					c.get_stats()[0].m_basis_luma_709_psnr,
+					c.get_stats()[0].m_best_etc1s_luma_709_psnr,
+					params.m_quality_level, (int)params.m_compression_level, tm.get_elapsed_secs(),
+					rgb_avg_psnr_min, rgb_avg_psnr_avg,
+					a_avg_psnr_min, a_avg_psnr_avg,
+					luma_709_psnr_min, luma_709_psnr_avg);
+				fflush(pCSV_file);
+			}
+#endif
 		}
 				
 		if (opts.m_individual)
@@ -909,6 +998,11 @@
 
 	} // file_index
 
+	all_tm.stop();
+
+	if (total_files > 1)
+		printf("Total compression time: %3.3f secs\n", all_tm.get_elapsed_secs());
+
 	if (pCSV_file)
 	{
 		fclose(pCSV_file);
@@ -929,6 +1023,18 @@
 		return false;
 	}
 
+	FILE* pCSV_file = nullptr;
+	if ((opts.m_csv_file.size()) && (opts.m_mode == cValidate))
+	{
+		pCSV_file = fopen_safe(opts.m_csv_file.c_str(), "w");
+		if (!pCSV_file)
+		{
+			error_printf("Failed opening CVS file \"%s\"\n", opts.m_csv_file.c_str());
+			return false;
+		}
+		//fprintf(pCSV_file, "Filename, Size, Slices, Width, Height, HasAlpha, BitsPerTexel, Slice0RGBAvgPSNR, Slice0RGBAAvgPSNR, Slice0Luma709PSNR, Slice0BestETC1SLuma709PSNR, Q, CL, Time, RGBAvgPSNRMin, RGBAvgPSNRAvg, AAvgPSNRMin, AAvgPSNRAvg, Luma709PSNRMin, Luma709PSNRAvg\n");
+	}
+
 	uint32_t total_unpack_warnings = 0;
 	uint32_t total_pvrtc_nonpow2_warnings = 0;
 
@@ -943,6 +1049,7 @@
 		if (!basisu::read_file_to_vec(pInput_filename, basis_data))
 		{
 			error_printf("Failed reading file \"%s\"\n", pInput_filename);
+			if (pCSV_file) fclose(pCSV_file);
 			return false;
 		}
 
@@ -951,15 +1058,17 @@
 		if (!basis_data.size())
 		{
 			error_printf("File is empty!\n");
+			if (pCSV_file) fclose(pCSV_file);
 			return false;
 		}
 
 		if (basis_data.size() > UINT32_MAX)
 		{
 			error_printf("File is too large!\n");
+			if (pCSV_file) fclose(pCSV_file);
 			return false;
 		}
-
+				
 		basist::basisu_transcoder dec(&sel_codebook);
 
 		if (!opts.m_fuzz_testing)
@@ -970,6 +1079,7 @@
 			if (!dec.validate_file_checksums(&basis_data[0], (uint32_t)basis_data.size(), true))
 			{
 				error_printf("File version is unsupported, or file fail CRC checks!\n");
+				if (pCSV_file) fclose(pCSV_file);
 				return false;
 			}
 		}
@@ -980,6 +1090,7 @@
 		if (!dec.get_file_info(&basis_data[0], (uint32_t)basis_data.size(), fileinfo))
 		{
 			error_printf("Failed retrieving Basis file information!\n");
+			if (pCSV_file) fclose(pCSV_file);
 			return false;
 		}
 				
@@ -1006,6 +1117,8 @@
 		for (uint32_t i = 0; i < fileinfo.m_total_images; i++)
 			printf("%u ", fileinfo.m_image_mipmap_levels[i]);
 		printf("\n");
+		
+		uint32_t total_texels = 0;
 
 		printf("\nImage info:\n");
 		for (uint32_t i = 0; i < fileinfo.m_total_images; i++)
@@ -1014,14 +1127,18 @@
 			if (!dec.get_image_info(&basis_data[0], (uint32_t)basis_data.size(), ii, i))
 			{
 				error_printf("get_image_info() failed!\n");
+				if (pCSV_file) fclose(pCSV_file);
 				return false;
 			}
 
 			printf("Image %u: MipLevels: %u OrigDim: %ux%u, BlockDim: %ux%u, FirstSlice: %u, HasAlpha: %u\n", i, ii.m_total_levels, ii.m_orig_width, ii.m_orig_height,
 				ii.m_num_blocks_x, ii.m_num_blocks_y, ii.m_first_slice_index, (uint32_t)ii.m_alpha_flag);
+
+			total_texels += ii.m_width * ii.m_height;
 		}
 
 		printf("\nSlice info:\n");
+				
 		for (uint32_t i = 0; i < fileinfo.m_slice_info.size(); i++)
 		{
 			const basist::basisu_slice_info& sliceinfo = fileinfo.m_slice_info[i];
@@ -1038,20 +1155,39 @@
 		}
 		printf("\n");
 
+		size_t comp_size = 0;
+		void* pComp_data = tdefl_compress_mem_to_heap(&basis_data[0], basis_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES);
+		mz_free(pComp_data);
+
+		const float basis_bits_per_texel = basis_data.size() * 8.0f / total_texels;
+		const float comp_bits_per_texel = comp_size * 8.0f / total_texels;
+		
+		printf("Original size: %u, bits per texel: %3.3f\nCompressed size (Deflate): %u, bits per texel: %3.3f\n", (uint32_t)basis_data.size(), basis_bits_per_texel, (uint32_t)comp_size, comp_bits_per_texel);
+
 		if (opts.m_mode == cInfo)
+		{
+			if (pCSV_file) fclose(pCSV_file);
 			return true;
+		}
+
 		interval_timer tm;
 		tm.start();
 
 		if (!dec.start_transcoding(&basis_data[0], (uint32_t)basis_data.size()))
 		{
 			error_printf("start_transcoding() failed!\n");
+			if (pCSV_file) fclose(pCSV_file);
 			return false;
 		}
 
-		printf("start_transcoding time: %3.3f ms\n", tm.get_elapsed_ms());
+		const double start_transcoding_time_ms = tm.get_elapsed_ms();
+
+		printf("start_transcoding time: %3.3f ms\n", start_transcoding_time_ms);
 				
-		std::vector< gpu_image_vec > gpu_images[(int)basist::transcoder_texture_format::cTFTotalTextureFormats];
+		basisu::vector< gpu_image_vec > gpu_images[(int)basist::transcoder_texture_format::cTFTotalTextureFormats];
+		
+		double total_format_transcoding_time_ms[(int)basist::transcoder_texture_format::cTFTotalTextureFormats];
+		clear_obj(total_format_transcoding_time_ms);
 		
 		int first_format = 0;
 		int last_format = (int)basist::transcoder_texture_format::cTFTotalTextureFormats;
@@ -1062,6 +1198,26 @@
 			last_format = first_format + 1;
 		}
 
+		if ((pCSV_file) && (file_index == 0))
+		{
+			std::string desc;
+			desc = "filename,basis_bitrate,comp_bitrate,images,levels,slices,start_transcoding_time,";
+			for (int format_iter = first_format; format_iter < last_format; format_iter++)
+			{
+				const basist::transcoder_texture_format transcoder_tex_fmt = static_cast<basist::transcoder_texture_format>(format_iter);
+
+				if (!basis_is_format_supported(transcoder_tex_fmt, fileinfo.m_tex_format))
+					continue;
+				if (transcoder_tex_fmt == basist::transcoder_texture_format::cTFBC7_ALT)
+					continue;
+
+				desc += std::string(basis_get_format_name(transcoder_tex_fmt));
+				if (format_iter != last_format - 1) 
+					desc += ",";
+			}
+			fprintf(pCSV_file, "%s\n", desc.c_str());
+		}
+
 		for (int format_iter = first_format; format_iter < last_format; format_iter++)
 		{
 			basist::transcoder_texture_format tex_fmt = static_cast<basist::transcoder_texture_format>(format_iter);
@@ -1102,6 +1258,7 @@
 					if (!dec.get_image_level_info(&basis_data[0], (uint32_t)basis_data.size(), level_info, image_index, level_index))
 					{
 						error_printf("Failed retrieving image level information (%u %u)!\n", image_index, level_index);
+						if (pCSV_file) fclose(pCSV_file);
 						return false;
 					}
 										
@@ -1133,11 +1290,14 @@
 					if (!dec.transcode_image_level(&basis_data[0], (uint32_t)basis_data.size(), image_index, level_index, gi.get_ptr(), gi.get_total_blocks(), transcoder_tex_fmt, decode_flags))
 					{
 						error_printf("Failed transcoding image level (%u %u %u)!\n", image_index, level_index, format_iter);
+						if (pCSV_file) fclose(pCSV_file);
 						return false;
 					}
 					
 					double total_transcode_time = tm.get_elapsed_ms();
 
+					total_format_transcoding_time_ms[format_iter] += total_transcode_time;
+
 					printf("Transcode of image %u level %u res %ux%u format %s succeeded in %3.3f ms\n", image_index, level_index, level_info.m_orig_width, level_info.m_orig_height, basist::basis_get_format_name(transcoder_tex_fmt), total_transcode_time);
 
 				} // format_iter
@@ -1166,7 +1326,7 @@
 					// No KTX tool that we know of supports cubemap arrays, so write individual cubemap files.
 					for (uint32_t image_index = 0; image_index < fileinfo.m_total_images; image_index += 6)
 					{
-						std::vector<gpu_image_vec> cubemap;
+						basisu::vector<gpu_image_vec> cubemap;
 						for (uint32_t i = 0; i < 6; i++)
 							cubemap.push_back(gpu_images[format_iter][image_index + i]);
 
@@ -1174,6 +1334,7 @@
 						if (!write_compressed_texture_file(ktx_filename.c_str(), cubemap, true))
 						{
 							error_printf("Failed writing KTX file \"%s\"!\n", ktx_filename.c_str());
+							if (pCSV_file) fclose(pCSV_file);
 							return false;
 						}
 						printf("Wrote KTX file \"%s\"\n", ktx_filename.c_str());
@@ -1201,6 +1362,7 @@
 						if (!write_compressed_texture_file(ktx_filename.c_str(), gi))
 						{
 							error_printf("Failed writing KTX file \"%s\"!\n", ktx_filename.c_str());
+							if (pCSV_file) fclose(pCSV_file);
 							return false;
 						}
 						printf("Wrote KTX file \"%s\"\n", ktx_filename.c_str());
@@ -1213,6 +1375,7 @@
 						if (!dec.get_image_level_info(&basis_data[0], (uint32_t)basis_data.size(), level_info, image_index, level_index))
 						{
 							error_printf("Failed retrieving image level information (%u %u)!\n", image_index, level_index);
+							if (pCSV_file) fclose(pCSV_file);
 							return false;
 						}
 
@@ -1246,6 +1409,7 @@
 							if (!write_3dfx_out_file(out_filename.c_str(), gi[level_index]))
 							{
 								error_printf("Failed writing to OUT file \"%s\"\n", out_filename.c_str());
+								if (pCSV_file) fclose(pCSV_file);
 								return false;
 							}
 							printf("Wrote .OUT file \"%s\"\n", out_filename.c_str());
@@ -1261,6 +1425,7 @@
 							if (!save_png(a_filename, u, cImageSaveGrayscale, 3))
 							{
 								error_printf("Failed writing to PNG file \"%s\"\n", a_filename.c_str());
+								if (pCSV_file) fclose(pCSV_file);
 								return false;
 							}
 							printf("Wrote PNG file \"%s\"\n", a_filename.c_str());
@@ -1273,6 +1438,7 @@
 							if (!save_png(rgba_filename, u))
 							{
 								error_printf("Failed writing to PNG file \"%s\"\n", rgba_filename.c_str());
+								if (pCSV_file) fclose(pCSV_file);
 								return false;
 							}
 							printf("Wrote PNG file \"%s\"\n", rgba_filename.c_str());
@@ -1298,6 +1464,7 @@
 				if (!dec.get_image_level_info(&basis_data[0], (uint32_t)basis_data.size(), level_info, image_index, level_index))
 				{
 					error_printf("Failed retrieving image level information (%u %u)!\n", image_index, level_index);
+					if (pCSV_file) fclose(pCSV_file);
 					return false;
 				}
 
@@ -1310,30 +1477,35 @@
 				if (!dec.transcode_image_level(&basis_data[0], (uint32_t)basis_data.size(), image_index, level_index, &img(0, 0).r, img.get_total_pixels(), transcoder_tex_fmt, 0, img.get_pitch(), nullptr, img.get_height()))
 				{
 					error_printf("Failed transcoding image level (%u %u %u)!\n", image_index, level_index, transcoder_tex_fmt);
+					if (pCSV_file) fclose(pCSV_file);
 					return false;
 				}
 
 				double total_transcode_time = tm.get_elapsed_ms();
+
+				total_format_transcoding_time_ms[(int)transcoder_tex_fmt] += total_transcode_time;
 								
 				printf("Transcode of image %u level %u res %ux%u format %s succeeded in %3.3f ms\n", image_index, level_index, level_info.m_orig_width, level_info.m_orig_height, basist::basis_get_format_name(transcoder_tex_fmt), total_transcode_time);
 
 				if (!validate_flag)
 				{
-				std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
-				if (!save_png(rgb_filename, img, cImageSaveIgnoreAlpha))
-				{
-					error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
-					return false;
-				}
-				printf("Wrote PNG file \"%s\"\n", rgb_filename.c_str());
+					std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
+					if (!save_png(rgb_filename, img, cImageSaveIgnoreAlpha))
+					{
+						error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
+						if (pCSV_file) fclose(pCSV_file);
+						return false;
+					}
+					printf("Wrote PNG file \"%s\"\n", rgb_filename.c_str());
 
-				std::string a_filename(base_filename + string_format("_unpacked_a_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
-				if (!save_png(a_filename, img, cImageSaveGrayscale, 3))
-				{
-					error_printf("Failed writing to PNG file \"%s\"\n", a_filename.c_str());
-					return false;
-				}
-				printf("Wrote PNG file \"%s\"\n", a_filename.c_str());
+					std::string a_filename(base_filename + string_format("_unpacked_a_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
+					if (!save_png(a_filename, img, cImageSaveGrayscale, 3))
+					{
+						error_printf("Failed writing to PNG file \"%s\"\n", a_filename.c_str());
+						if (pCSV_file) fclose(pCSV_file);
+						return false;
+					}
+					printf("Wrote PNG file \"%s\"\n", a_filename.c_str());
 				}
 
 			} // level_index
@@ -1351,10 +1523,11 @@
 				if (!dec.get_image_level_info(&basis_data[0], (uint32_t)basis_data.size(), level_info, image_index, level_index))
 				{
 					error_printf("Failed retrieving image level information (%u %u)!\n", image_index, level_index);
+					if (pCSV_file) fclose(pCSV_file);
 					return false;
 				}
 
-				std::vector<uint16_t> packed_img(level_info.m_orig_width * level_info.m_orig_height);
+				basisu::vector<uint16_t> packed_img(level_info.m_orig_width * level_info.m_orig_height);
 
 				fill_buffer_with_random_bytes(&packed_img[0], packed_img.size() * sizeof(uint16_t));
 
@@ -1363,11 +1536,14 @@
 				if (!dec.transcode_image_level(&basis_data[0], (uint32_t)basis_data.size(), image_index, level_index, &packed_img[0], (uint32_t)packed_img.size(), transcoder_tex_fmt, 0, level_info.m_orig_width, nullptr, level_info.m_orig_height))
 				{
 					error_printf("Failed transcoding image level (%u %u %u)!\n", image_index, level_index, transcoder_tex_fmt);
+					if (pCSV_file) fclose(pCSV_file);
 					return false;
 				}
 
 				double total_transcode_time = tm.get_elapsed_ms();
 
+				total_format_transcoding_time_ms[(int)transcoder_tex_fmt] += total_transcode_time;
+
 				image img(level_info.m_orig_width, level_info.m_orig_height);
 				for (uint32_t y = 0; y < level_info.m_orig_height; y++)
 				{
@@ -1386,23 +1562,27 @@
 
 				if (!validate_flag)
 				{
-				std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
-				if (!save_png(rgb_filename, img, cImageSaveIgnoreAlpha))
-				{
-					error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
-					return false;
-				}
-				printf("Wrote PNG file \"%s\"\n", rgb_filename.c_str());
+					std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
+					if (!save_png(rgb_filename, img, cImageSaveIgnoreAlpha))
+					{
+						error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
+						if (pCSV_file) fclose(pCSV_file);
+						return false;
+					}
+					printf("Wrote PNG file \"%s\"\n", rgb_filename.c_str());
 				}
 
 			} // level_index
 		} // image_index
 
 		// Now unpack to RGBA4444 using the transcoder itself to do the unpacking to raster images
+		uint32_t max_mipmap_levels = 0;
 		for (uint32_t image_index = 0; image_index < fileinfo.m_total_images; image_index++)
 		{
 			for (uint32_t level_index = 0; level_index < fileinfo.m_image_mipmap_levels[image_index]; level_index++)
 			{
+				max_mipmap_levels = std::max(max_mipmap_levels, fileinfo.m_image_mipmap_levels[image_index]);
+					 
 				const basist::transcoder_texture_format transcoder_tex_fmt = basist::transcoder_texture_format::cTFRGBA4444;
 
 				basist::basisu_image_level_info level_info;
@@ -1410,10 +1590,11 @@
 				if (!dec.get_image_level_info(&basis_data[0], (uint32_t)basis_data.size(), level_info, image_index, level_index))
 				{
 					error_printf("Failed retrieving image level information (%u %u)!\n", image_index, level_index);
+					if (pCSV_file) fclose(pCSV_file);
 					return false;
 				}
 
-				std::vector<uint16_t> packed_img(level_info.m_orig_width * level_info.m_orig_height);
+				basisu::vector<uint16_t> packed_img(level_info.m_orig_width * level_info.m_orig_height);
 
 				fill_buffer_with_random_bytes(&packed_img[0], packed_img.size() * sizeof(uint16_t));
 
@@ -1422,11 +1603,14 @@
 				if (!dec.transcode_image_level(&basis_data[0], (uint32_t)basis_data.size(), image_index, level_index, &packed_img[0], (uint32_t)packed_img.size(), transcoder_tex_fmt, 0, level_info.m_orig_width, nullptr, level_info.m_orig_height))
 				{
 					error_printf("Failed transcoding image level (%u %u %u)!\n", image_index, level_index, transcoder_tex_fmt);
+					if (pCSV_file) fclose(pCSV_file);
 					return false;
 				}
 
 				double total_transcode_time = tm.get_elapsed_ms();
 
+				total_format_transcoding_time_ms[(int)transcoder_tex_fmt] += total_transcode_time;
+
 				image img(level_info.m_orig_width, level_info.m_orig_height);
 				for (uint32_t y = 0; y < level_info.m_orig_height; y++)
 				{
@@ -1446,26 +1630,55 @@
 
 				if (!validate_flag)
 				{
-				std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
-				if (!save_png(rgb_filename, img, cImageSaveIgnoreAlpha))
-				{
-					error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
-					return false;
-				}
-				printf("Wrote PNG file \"%s\"\n", rgb_filename.c_str());
+					std::string rgb_filename(base_filename + string_format("_unpacked_rgb_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
+					if (!save_png(rgb_filename, img, cImageSaveIgnoreAlpha))
+					{
+						error_printf("Failed writing to PNG file \"%s\"\n", rgb_filename.c_str());
+						if (pCSV_file) fclose(pCSV_file);
+						return false;
+					}
+					printf("Wrote PNG file \"%s\"\n", rgb_filename.c_str());
 
-				std::string a_filename(base_filename + string_format("_unpacked_a_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
-				if (!save_png(a_filename, img, cImageSaveGrayscale, 3))
-				{
-					error_printf("Failed writing to PNG file \"%s\"\n", a_filename.c_str());
-					return false;
-				}
-				printf("Wrote PNG file \"%s\"\n", a_filename.c_str());
+					std::string a_filename(base_filename + string_format("_unpacked_a_%s_%u_%04u.png", basist::basis_get_format_name(transcoder_tex_fmt), level_index, image_index));
+					if (!save_png(a_filename, img, cImageSaveGrayscale, 3))
+					{
+						error_printf("Failed writing to PNG file \"%s\"\n", a_filename.c_str());
+						if (pCSV_file) fclose(pCSV_file);
+						return false;
+					}
+					printf("Wrote PNG file \"%s\"\n", a_filename.c_str());
 				}
 
 			} // level_index
 		} // image_index
 
+		if (pCSV_file)
+		{
+			fprintf(pCSV_file, "%s, %3.3f, %3.3f, %u, %u, %u, %3.3f, ",
+				base_filename.c_str(),
+				basis_bits_per_texel,
+				comp_bits_per_texel,
+				fileinfo.m_total_images,
+				max_mipmap_levels,
+				(uint32_t)fileinfo.m_slice_info.size(),
+				start_transcoding_time_ms);
+
+			for (int format_iter = first_format; format_iter < last_format; format_iter++)
+			{
+				const basist::transcoder_texture_format transcoder_tex_fmt = static_cast<basist::transcoder_texture_format>(format_iter);
+
+				if (!basis_is_format_supported(transcoder_tex_fmt, fileinfo.m_tex_format))
+					continue;
+				if (transcoder_tex_fmt == basist::transcoder_texture_format::cTFBC7_ALT)
+					continue;
+
+				fprintf(pCSV_file, "%3.3f", total_format_transcoding_time_ms[format_iter]);
+				if (format_iter != (last_format - 1))
+					fprintf(pCSV_file, ",");
+			}
+			fprintf(pCSV_file, "\n");
+		}
+
 	} // file_index
 
 	if (total_pvrtc_nonpow2_warnings)
@@ -1476,6 +1689,12 @@
 	else
 		printf("Success\n");
 
+	if (pCSV_file)
+	{
+		fclose(pCSV_file);
+		pCSV_file = nullptr;
+	}
+
 	return true;
 }
 
@@ -1594,8 +1813,6 @@
 #include "encoder/basisu_astc_decomp.h"
 #include "encoder/basisu_pvrtc1_4.h"
 
-#define MINIZ_HEADER_FILE_ONLY
-#include "encoder/basisu_miniz.h"
 static bool bench_mode(command_line_params& opts)
 {
 #if 0
@@ -1777,7 +1994,7 @@
 		double total_bench_time = 0;
 		double total_bench2_time = 0;
 
-		std::vector<basist::uastc_block> ublocks(total_blocks);
+		basisu::vector<basist::uastc_block> ublocks(total_blocks);
 
 #if 0
 		astc_enc_settings astc_settings;
@@ -2149,7 +2366,7 @@
 			total_raw_size += ublocks.size() * 16;
 		}
 
-		std::vector<color_rgba> orig_block_pixels(ublocks.size() * 16);
+		basisu::vector<color_rgba> orig_block_pixels(ublocks.size() * 16);
 		for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
 			for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
 				img.extract_block_clamped(&orig_block_pixels[(block_x + block_y * num_blocks_x) * 16], block_x * 4, block_y * 4, 4, 4);
@@ -2179,12 +2396,12 @@
 			}
 		}
 
-		for (float q = .2f; q <= 3.0f; q += (q >= 1.0f ? .5f : .1f))
+		for (float q = .2f; q <= 10.0f; q += (q >= 1.0f ? .5f : .1f))
 		{
 			printf("Q: %f\n", q);
 
 			uastc_rdo_params p;
-			p.m_quality_scaler = q;
+			p.m_lambda = q;
 			p.m_max_allowed_rms_increase_ratio = 10.0f;
 			p.m_skip_block_rms_thresh = 8.0f;
 			
@@ -2738,15 +2955,79 @@
 	return true;
 }
 
+// Reads the file at pFilename and returns the size in bytes of its contents after
+// Deflate compression with miniz (tdefl_compress_mem_to_heap, TDEFL_MAX_PROBES_MASK flags).
+// orig_size receives the uncompressed size. If the file cannot be read or is empty,
+// both orig_size and the return value are 0. The return value is also 0 when the
+// compressor does not report any output.
+static uint32_t compute_miniz_compressed_size(const char* pFilename, uint32_t &orig_size)
+{
+	orig_size = 0;
+
+	uint8_vec buf;
+	if (!read_file_to_vec(pFilename, buf))
+		return 0;
+
+	if (!buf.size())
+		return 0;
+
+	orig_size = buf.size();
+
+	size_t comp_size = 0;
+	void* pComp_data = tdefl_compress_mem_to_heap(&buf[0], buf.size(), &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES);
+
+	// Only the compressed size is of interest, so the compressed data is released right away.
+	mz_free(pComp_data);
+
+	return (uint32_t)comp_size;
+}
+
+// Implements the cCompSize tool mode: prints the original and miniz-compressed sizes of the
+// single input file. Returns false if exactly one input file was not specified, if the file
+// is unreadable or empty, or if compression produced no output.
+static bool compsize_mode(command_line_params& opts)
+{
+	if (opts.m_input_filenames.size() != 1)
+	{
+		error_printf("Must specify a filename using -file\n");
+		return false;
+	}
+
+	const char* pFilename = opts.m_input_filenames[0].c_str();
+
+	uint32_t orig_size = 0;
+	const uint32_t comp_size = compute_miniz_compressed_size(pFilename, orig_size);
+
+	// The helper reports a read failure and an empty file the same way (orig_size == 0).
+	// Treat that as an error rather than printing two zero sizes and reporting success.
+	if (!orig_size)
+	{
+		error_printf("Failed reading file \"%s\", or the file is empty\n", pFilename);
+		return false;
+	}
+
+	// The input is non-empty here, so a compressed size of 0 means no output was produced.
+	if (!comp_size)
+	{
+		error_printf("Failed compressing file \"%s\"\n", pFilename);
+		return false;
+	}
+
+	printf("Original file size: %u bytes\n", orig_size);
+	printf("miniz compressed size: %u bytes\n", comp_size);
+
+	return true;
+}
+
 static int main_internal(int argc, const char **argv)
 {
-	printf("Basis Universal GPU Texture Compressor Reference Encoder v" BASISU_TOOL_VERSION "\nCopyright (C) 2019-2020 Binomial LLC, All rights reserved\n");
+	printf("Basis Universal GPU Texture Compressor Reference Encoder v" BASISU_TOOL_VERSION "\nCopyright (C) 2019-2021 Binomial LLC, All rights reserved\n");
 
 	//interval_timer tm;
 	//tm.start();
 
 	basisu_encoder_init();
-
+		
 	//printf("Encoder and transcoder libraries initialized in %3.3f ms\n", tm.get_elapsed_ms());
 
 #if defined(DEBUG) || defined(_DEBUG)
@@ -2766,6 +3020,12 @@
 		return EXIT_FAILURE;
 	}
 
+#if BASISU_SUPPORT_SSE
+	printf("Using SSE 4.1: %u, Multithreading: %u\n", g_cpu_supports_sse41, (uint32_t)opts.m_comp_params.m_multithreading);
+#else
+	printf("Multithreading: %u\n", (uint32_t)opts.m_comp_params.m_multithreading);
+#endif
+
 	if (!opts.process_listing_files())
 		return EXIT_FAILURE;
 
@@ -2805,6 +3065,9 @@
 	case cBench:
 		status = bench_mode(opts);
 		break;
+	case cCompSize:
+		status = compsize_mode(opts);
+		break;
 	default:
 		assert(0);
 		break;
diff --git a/encoder/basisu_backend.cpp b/encoder/basisu_backend.cpp
index 4c130e9..5db04f0 100644
--- a/encoder/basisu_backend.cpp
+++ b/encoder/basisu_backend.cpp
@@ -1,5 +1,5 @@
 // basisu_backend.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -17,6 +17,11 @@
 //
 #include "basisu_backend.h"
 
+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
 #define BASISU_FASTER_SELECTOR_REORDERING 0
 #define BASISU_BACKEND_VERIFY(c) verify(c, __LINE__);
 
@@ -178,9 +183,10 @@
 		basisu_frontend& r = *m_pFront_end;
 		//const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
 
-		if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0))
+		//if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0))
+		if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 1))
 		{
-			// We're changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed)
+			// We've changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed)
 			uint_vec new_block_endpoints(get_total_blocks());
 
 			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
@@ -393,6 +399,7 @@
 
 		BASISU_BACKEND_VERIFY(total_invalid_crs == 0);
 	}
+
 	void basisu_backend::create_encoder_blocks()
 	{
 		basisu_frontend& r = *m_pFront_end;
@@ -662,7 +669,7 @@
 		histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE + 1);
 		histogram selector_history_buf_rle_histogram(1 << basist::SELECTOR_HISTORY_BUF_RLE_COUNT_BITS);
 
-		std::vector<uint_vec> selector_syms(m_slices.size());
+		basisu::vector<uint_vec> selector_syms(m_slices.size());
 
 		const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = r.get_total_selector_clusters();
 		const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + basist::MAX_SELECTOR_HISTORY_BUF_SIZE;
@@ -672,7 +679,7 @@
 		histogram delta_endpoint_histogram(r.get_total_endpoint_clusters());
 
 		histogram endpoint_pred_histogram(basist::ENDPOINT_PRED_TOTAL_SYMBOLS);
-		std::vector<uint_vec> endpoint_pred_syms(m_slices.size());
+		basisu::vector<uint_vec> endpoint_pred_syms(m_slices.size());
 
 		uint32_t total_endpoint_indices_remapped = 0;
 
@@ -884,23 +891,32 @@
 						{
 							const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
 
-							etc_block etc_blk(r.get_output_block(block_index));
+							const etc_block& etc_blk = r.get_output_block(block_index);
 
 							color_rgba etc_blk_unpacked[16];
 							unpack_etc1(etc_blk, etc_blk_unpacked);
 
 							uint64_t cur_err = 0;
-							for (uint32_t p = 0; p < 16; p++)
-								cur_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
-
+							if (r.get_params().m_perceptual)
+							{
+								for (uint32_t p = 0; p < 16; p++)
+									cur_err += color_distance(true, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+							}
+							else
+							{
+								for (uint32_t p = 0; p < 16; p++)
+									cur_err += color_distance(false, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+							}
+														
 							uint64_t best_trial_err = UINT64_MAX;
 							int best_trial_idx = 0;
 							uint32_t best_trial_history_buf_idx = 0;
 
-
 							const float selector_remap_thresh = maximum(1.0f, m_params.m_selector_rdo_quality_thresh); //2.5f;
 							const bool use_strict_search = (m_params.m_compression_level == 0) && (selector_remap_thresh == 1.0f);
 
+							const uint64_t limit_err = (uint64_t)ceilf(cur_err * selector_remap_thresh);
+							
 							for (uint32_t j = 0; j < selector_history_buf.size(); j++)
 							{
 								const int trial_idx = selector_history_buf[j];
@@ -917,30 +933,42 @@
 								}
 								else
 								{
-									for (uint32_t sy = 0; sy < 4; sy++)
-										for (uint32_t sx = 0; sx < 4; sx++)
-											etc_blk.set_selector(sx, sy, m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](sx, sy));
-
-									// TODO: Optimize this
-									unpack_etc1(etc_blk, etc_blk_unpacked);
-
 									uint64_t trial_err = 0;
-									const uint64_t thresh_err = minimum((uint64_t)ceilf(cur_err * selector_remap_thresh), best_trial_err);
-									for (uint32_t p = 0; p < 16; p++)
+									const uint64_t thresh_err = minimum(limit_err, best_trial_err);
+
+									color_rgba block_colors[4];
+									etc_blk.get_block_colors(block_colors, 0);
+
+									const uint8_t* pSelectors = &m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](0, 0);
+									
+									if (r.get_params().m_perceptual)
 									{
-										trial_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
-										if (trial_err > thresh_err)
-											break;
+										for (uint32_t p = 0; p < 16; p++)
+										{
+											uint32_t sel = pSelectors[p];
+											trial_err += color_distance(true, src_pixels.get_ptr()[p], block_colors[sel], false);
+											if (trial_err > thresh_err)
+												break;
+										}
+									}
+									else
+									{
+										for (uint32_t p = 0; p < 16; p++)
+										{
+											uint32_t sel = pSelectors[p];
+											trial_err += color_distance(false, src_pixels.get_ptr()[p], block_colors[sel], false);
+											if (trial_err > thresh_err)
+												break;
+										}
 									}
 
-									if (trial_err <= cur_err * selector_remap_thresh)
+									if ((trial_err < best_trial_err) && (trial_err <= thresh_err))
 									{
-										if (trial_err < best_trial_err)
-										{
-											best_trial_err = trial_err;
-											best_trial_idx = trial_idx;
-											best_trial_history_buf_idx = j;
-										}
+										assert(trial_err <= limit_err);
+										
+										best_trial_err = trial_err;
+										best_trial_idx = trial_idx;
+										best_trial_history_buf_idx = j;
 									}
 								}
 							}
@@ -1086,7 +1114,8 @@
 			total_selector_indices_remapped, total_selector_indices_remapped * 100.0f / get_total_blocks(),
 			total_used_selector_history_buf, total_used_selector_history_buf * 100.0f / get_total_blocks());
 
-		if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0))
+		//if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0))
+		if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 1))
 		{
 			int_vec unused;
 			r.reoptimize_remapped_endpoints(block_endpoint_indices, unused, false, &block_selector_indices);
diff --git a/encoder/basisu_backend.h b/encoder/basisu_backend.h
index e8518e2..0f9ca37 100644
--- a/encoder/basisu_backend.h
+++ b/encoder/basisu_backend.h
@@ -1,5 +1,5 @@
 // basisu_backend.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -49,7 +49,7 @@
 		}
 	};
 
-	typedef std::vector<encoder_block> encoder_block_vec;
+	typedef basisu::vector<encoder_block> encoder_block_vec;
 	typedef vector2D<encoder_block> encoder_block_vec2D;
 
 	struct etc1_endpoint_palette_entry
@@ -69,7 +69,7 @@
 		}
 	};
 
-	typedef std::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec;
+	typedef basisu::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec;
 
 	struct basisu_backend_params
 	{
@@ -135,7 +135,7 @@
 		bool m_iframe;
 	};
 
-	typedef std::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;
+	typedef basisu::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;
 
 	struct basisu_backend_output
 	{
@@ -152,7 +152,7 @@
 		basisu_backend_slice_desc_vec m_slice_desc;
 
 		uint8_vec m_slice_image_tables;
-		std::vector<uint8_vec> m_slice_image_data;
+		basisu::vector<uint8_vec> m_slice_image_data;
 		uint16_vec m_slice_image_crcs;
 
 		basisu_backend_output()
@@ -219,11 +219,11 @@
 			bool m_was_used;
 		};
 
-		typedef std::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec;
+		typedef basisu::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec;
 
 		etc1_global_selector_cb_entry_desc_vec m_global_selector_palette_desc;
 
-		std::vector<encoder_block_vec2D> m_slice_encoder_blocks;
+		basisu::vector<encoder_block_vec2D> m_slice_encoder_blocks;
 
 		// Maps OLD to NEW endpoint/selector indices
 		uint_vec m_endpoint_remap_table_old_to_new;
diff --git a/encoder/basisu_basis_file.cpp b/encoder/basisu_basis_file.cpp
index 9662a9a..705ed7e 100644
--- a/encoder/basisu_basis_file.cpp
+++ b/encoder/basisu_basis_file.cpp
@@ -1,5 +1,5 @@
 // basisu_basis_file.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_basis_file.h b/encoder/basisu_basis_file.h
index 7d9c577..98498a0 100644
--- a/encoder/basisu_basis_file.h
+++ b/encoder/basisu_basis_file.h
@@ -49,7 +49,7 @@
 
 	private:
 		basist::basis_file_header m_header;
-		std::vector<basist::basis_slice_desc> m_images_descs;
+		basisu::vector<basist::basis_slice_desc> m_images_descs;
 
 		uint8_vec m_comp_data;
 
diff --git a/encoder/basisu_bc7enc.cpp b/encoder/basisu_bc7enc.cpp
index dcec912..06aa7eb 100644
--- a/encoder/basisu_bc7enc.cpp
+++ b/encoder/basisu_bc7enc.cpp
@@ -1,5 +1,5 @@
 // File: basisu_bc7enc.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_bc7enc.h b/encoder/basisu_bc7enc.h
index 3af606f..2346991 100644
--- a/encoder/basisu_bc7enc.h
+++ b/encoder/basisu_bc7enc.h
@@ -1,5 +1,5 @@
 // File: basisu_bc7enc.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_comp.cpp b/encoder/basisu_comp.cpp
index fdaf3f0..396157d 100644
--- a/encoder/basisu_comp.cpp
+++ b/encoder/basisu_comp.cpp
@@ -1,5 +1,5 @@
 // basisu_comp.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
 {
    basis_compressor::basis_compressor() :
 		m_basis_file_size(0),
-		m_basis_bits_per_texel(0),
+		m_basis_bits_per_texel(0.0f),
 		m_total_blocks(0),
 		m_auto_global_sel_pal(false),
 		m_any_source_image_has_alpha(false)
@@ -62,7 +62,7 @@
 			PRINT_BOOL_VALUE(m_debug_images);
 			PRINT_BOOL_VALUE(m_global_sel_pal);
 			PRINT_BOOL_VALUE(m_auto_global_sel_pal);
-			PRINT_BOOL_VALUE(m_compression_level);
+			PRINT_INT_VALUE(m_compression_level);
 			PRINT_BOOL_VALUE(m_no_hybrid_sel_cb);
 			PRINT_BOOL_VALUE(m_perceptual);
 			PRINT_BOOL_VALUE(m_no_endpoint_rdo);
@@ -92,6 +92,7 @@
 			PRINT_BOOL_VALUE(m_mip_gen);
 			PRINT_BOOL_VALUE(m_mip_renormalize);
 			PRINT_BOOL_VALUE(m_mip_wrapping);
+			PRINT_BOOL_VALUE(m_mip_fast);
 			PRINT_BOOL_VALUE(m_mip_srgb);
 			PRINT_FLOAT_VALUE(m_mip_premultiplied);
 			PRINT_FLOAT_VALUE(m_mip_scale);
@@ -112,6 +113,12 @@
 			PRINT_INT_VALUE(m_rdo_uastc_dict_size);
 			PRINT_FLOAT_VALUE(m_rdo_uastc_max_allowed_rms_increase_ratio);
 			PRINT_FLOAT_VALUE(m_rdo_uastc_skip_block_rms_thresh);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_max_smooth_block_error_scale);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_smooth_block_max_std_dev);
+			PRINT_BOOL_VALUE(m_rdo_uastc_favor_simpler_modes_in_rdo_mode);
+			PRINT_BOOL_VALUE(m_rdo_uastc_multithreading);
+
+			PRINT_FLOAT_VALUE(m_resample_factor);
 						
 #undef PRINT_BOOL_VALUE
 #undef PRINT_INT_VALUE
@@ -209,6 +216,10 @@
 #endif
 						BASISU_NOTE_UNUSED(num_blocks_y);
 						
+						uint32_t uastc_flags = m_params.m_pack_uastc_flags;
+						if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode))
+							uastc_flags |= cPackUASTCFavorSimplerModes;
+
 						for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 						{
 							const uint32_t block_x = block_index % num_blocks_x;
@@ -220,7 +231,7 @@
 
 							basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y);
 
-							encode_uastc(&block_pixels[0][0].r, dest_block, m_params.m_pack_uastc_flags);
+							encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags);
 
 							total_blocks_processed++;
 							
@@ -245,14 +256,16 @@
 			if (m_params.m_rdo_uastc)
 			{
 				uastc_rdo_params rdo_params;
-				rdo_params.m_quality_scaler = m_params.m_rdo_uastc_quality_scalar;
+				rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar;
 				rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio;
 				rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh;
 				rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size;
+				rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale;
+				rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev;
 								
 				bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(),
-					(const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_pJob_pool,
-					m_params.m_pJob_pool ? std::min<uint32_t>(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0);
+					(const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr,
+					(m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? std::min<uint32_t>(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0);
 				if (!status)
 				{
 					return cECFailedUASTCRDOPostProcess;
@@ -269,10 +282,13 @@
 		return cECSuccess;
 	}
 
-	bool basis_compressor::generate_mipmaps(const image &img, std::vector<image> &mips, bool has_alpha)
+	bool basis_compressor::generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha)
 	{
 		debug_printf("basis_compressor::generate_mipmaps\n");
 
+		interval_timer tm;
+		tm.start();
+
 		uint32_t total_levels = 1;
 		uint32_t w = img.get_width(), h = img.get_height();
 		while (maximum<uint32_t>(w, h) > (uint32_t)m_params.m_mip_smallest_dimension)
@@ -326,10 +342,18 @@
 			const uint32_t level_width = maximum<uint32_t>(1, img.get_width() >> level);
 			const uint32_t level_height = maximum<uint32_t>(1, img.get_height() >> level);
 
-			image &level_img = *enlarge_vector(mips, 1);
+			image& level_img = *enlarge_vector(mips, 1);
 			level_img.resize(level_width, level_height);
 
-			bool status = image_resample(img, level_img, m_params.m_mip_srgb, m_params.m_mip_filter.c_str(), m_params.m_mip_scale, m_params.m_mip_wrapping, 0, has_alpha ? 4 : 3);
+			const image* pSource_image = &img;
+
+			if (m_params.m_mip_fast)
+			{
+				if (level > 1)
+					pSource_image = &mips[level - 1];
+			}
+
+			bool status = image_resample(*pSource_image, level_img, m_params.m_mip_srgb, m_params.m_mip_filter.c_str(), m_params.m_mip_scale, m_params.m_mip_wrapping, 0, has_alpha ? 4 : 3);
 			if (!status)
 			{
 				error_printf("basis_compressor::generate_mipmaps: image_resample() failed!\n");
@@ -341,6 +365,9 @@
 		}
 #endif
 
+		if (m_params.m_debug)
+			debug_printf("Total mipmap generation time: %f secs\n", tm.get_elapsed_secs());
+
 		return true;
 	}
 
@@ -361,8 +388,8 @@
 
 		m_any_source_image_has_alpha = false;
 
-		std::vector<image> source_images;
-		std::vector<std::string> source_filenames;
+		basisu::vector<image> source_images;
+		basisu::vector<std::string> source_filenames;
 		
 		// First load all source images, and determine if any have an alpha channel.
 		for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
@@ -457,11 +484,19 @@
 #if DEBUG_CROP_TEXTURE_TO_64x64
 			file_image.resize(64, 64);
 #endif
-#if DEBUG_RESIZE_TEXTURE
-			image temp_img((file_image.get_width() + 1) / 2, (file_image.get_height() + 1) / 2);
-			image_resample(file_image, temp_img, m_params.m_perceptual, "kaiser");
-			temp_img.swap(file_image);
-#endif
+
+			if (m_params.m_resample_factor > 0.0f)
+			{
+				int new_width = std::min<int>(std::max(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+				int new_height = std::min<int>(std::max(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+
+				debug_printf("Resampling to %ix%i\n", new_width, new_height);
+
+				// TODO: A box filter - kaiser looks too sharp on video.
+				image temp_img(new_width, new_height);
+				image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser");
+				temp_img.swap(file_image);
+			}
 
 			if ((!file_image.get_width()) || (!file_image.get_height()))
 			{
@@ -487,7 +522,7 @@
 			const std::string &source_filename = source_filenames[source_file_index];
 
 			// Now, for each source image, create the slices corresponding to that image.
-			std::vector<image> slices;
+			basisu::vector<image> slices;
 			
 			slices.reserve(32);
 			slices.push_back(file_image);
@@ -505,7 +540,7 @@
 			if ((m_any_source_image_has_alpha) && (!m_params.m_uastc))
 			{
 				// For ETC1S, if source has alpha, then even mips will have RGB, and odd mips will have alpha in RGB. 
-				std::vector<image> alpha_slices;
+				basisu::vector<image> alpha_slices;
 				uint_vec new_mip_indices;
 
 				alpha_slices.reserve(slices.size() * 2);
@@ -859,11 +894,16 @@
 			float color_endpoint_quality = quality;
 
 			const float endpoint_split_point = 0.5f;
+			
+			// In v1.2 and in previous versions, the endpoint codebook size at quality 128 was 3072. This wasn't quite large enough.
+			const int ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE = 4800;
+			const int MAX_ENDPOINT_CODEBOOK_SIZE = 8192;
+
 			if (color_endpoint_quality <= mid)
 			{
 				color_endpoint_quality = lerp(0.0f, endpoint_split_point, powf(color_endpoint_quality / mid, .65f));
 
-				max_endpoints = clamp<int>(max_endpoints, 256, 3072);
+				max_endpoints = clamp<int>(max_endpoints, 256, ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE);
 				max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks);
 								
 				if (max_endpoints < 64)
@@ -874,12 +914,12 @@
 			{
 				color_endpoint_quality = powf((color_endpoint_quality - mid) / (1.0f - mid), 1.6f);
 
-				max_endpoints = clamp<int>(max_endpoints, 256, 8192);
+				max_endpoints = clamp<int>(max_endpoints, 256, MAX_ENDPOINT_CODEBOOK_SIZE);
 				max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks);
 								
-				if (max_endpoints < 3072)
-					max_endpoints = 3072;
-				endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(3072, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters);
+				if (max_endpoints < ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE)
+					max_endpoints = ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE;
+				endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters);
 			}
 						
 			float bits_per_selector_cluster = m_params.m_global_sel_pal ? 21.0f : 14.0f;
@@ -1131,12 +1171,14 @@
 			return false;
 		}
 
-		debug_printf("basisu_comppressor::start_transcoding() took %3.3fms\n", tm.get_elapsed_ms());
+		double start_transcoding_time = tm.get_elapsed_secs();
+
+		debug_printf("basisu_compressor::start_transcoding() took %3.3fms\n", start_transcoding_time * 1000.0f);
 
 		uint32_t total_orig_pixels = 0;
 		uint32_t total_texels = 0;
 
-		double total_time_etc1 = 0;
+		double total_time_etc1s_or_astc = 0;
 
 		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 		{
@@ -1155,7 +1197,7 @@
 				return false;
 			}
 
-			total_time_etc1 += tm.get_elapsed_secs();
+			total_time_etc1s_or_astc += tm.get_elapsed_secs();
 
 			if (encoded_output.m_tex_format == basist::basis_tex_format::cETC1S)
 			{
@@ -1173,37 +1215,44 @@
 			total_orig_pixels += m_slice_descs[i].m_orig_width * m_slice_descs[i].m_orig_height;
 			total_texels += m_slice_descs[i].m_width * m_slice_descs[i].m_height;
 		}
-								
+												
 		double total_time_bc7 = 0;
 
-		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) &&
+			basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S))
 		{
-			gpu_image decoded_texture;
-			decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
-
-			tm.start();
-
-			if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
-				reinterpret_cast<etc_block *>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16))
+			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 			{
-				error_printf("Transcoding failed to BC7 on slice %u!\n", i);
-				return false;
+				gpu_image decoded_texture;
+				decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
+
+				tm.start();
+
+				if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
+					reinterpret_cast<etc_block*>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16))
+				{
+					error_printf("Transcoding failed to BC7 on slice %u!\n", i);
+					return false;
+				}
+
+				total_time_bc7 += tm.get_elapsed_secs();
+
+				m_decoded_output_textures_bc7[i] = decoded_texture;
 			}
-
-			total_time_bc7 += tm.get_elapsed_secs();
-
-			m_decoded_output_textures_bc7[i] = decoded_texture;
 		}
 
 		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 		{
 			m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]);
-			m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]);
+
+			if (m_decoded_output_textures_bc7[i].get_pixel_width())
+				m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]);
 		}
 
-		debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_uastc ? "ASTC" : "ETC1", total_time_etc1 * 1000.0f, total_orig_pixels / total_time_etc1);
+		debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_uastc ? "ASTC" : "ETC1", total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc);
 
-		debug_printf("Transcoded to BC7 in %3.3fms, %f texels/sec\n", total_time_bc7 * 1000.0f, total_orig_pixels / total_time_bc7);
+		if (total_time_bc7 != 0)
+			debug_printf("Transcoded to BC7 in %3.3fms, %f texels/sec\n", total_time_bc7 * 1000.0f, total_orig_pixels / total_time_bc7);
 
 		debug_printf("Total .basis output file size: %u, %3.3f bits/texel\n", comp_data.size(), comp_data.size() * 8.0f / total_orig_pixels);
 				
@@ -1257,15 +1306,20 @@
 				printf("basis_compressor::create_basis_file_and_transcode:: miniz compression or decompression failed!\n");
 				return false;
 			}
+
 			mz_free(pComp_data);
 			mz_free(pDecomp_data);
+
 			uint32_t total_texels = 0;
 			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 				total_texels += (m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y) * 16;
+			
+			m_basis_bits_per_texel = comp_size * 8.0f / total_texels;
+
 			debug_printf(".basis file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n",
 				(uint32_t)comp_data.size(),
 				(uint32_t)comp_size,
-				comp_size * 8.0f / total_texels);
+				m_basis_bits_per_texel);
 		}
 
 		m_stats.resize(m_slice_descs.size());
@@ -1330,6 +1384,7 @@
 					debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
 				}
 
+				if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width())
 				{
 					// ---- BC7 stats
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3);
@@ -1418,14 +1473,15 @@
 				{
 					gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]);
 					decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
-					write_compressed_texture_file((out_basename + "_transcoded_etc1s_astc.ktx").c_str(), decoded_etc1s_or_astc);
+					write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc);
 
 					image temp(m_decoded_output_textures_unpacked[slice_index]);
 					temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
-					save_png(out_basename + "_transcoded_etc1s_astc.png", temp);
+					save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp);
 				}
 
 				// Write decoded BC7 debug images
+				if (m_decoded_output_textures_bc7[slice_index].get_pixel_width())
 				{
 					gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]);
 					decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
diff --git a/encoder/basisu_comp.h b/encoder/basisu_comp.h
index 7722e3f..c1a5090 100644
--- a/encoder/basisu_comp.h
+++ b/encoder/basisu_comp.h
@@ -1,5 +1,5 @@
 // basisu_comp.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -41,8 +41,8 @@
 
 	const uint32_t BASISU_MAX_SLICES = 0xFFFFFF;
 
-	const int BASISU_RDO_UASTC_DICT_SIZE_DEFAULT = 32768;
-	const int BASISU_RDO_UASTC_DICT_SIZE_MIN = 256;
+	const int BASISU_RDO_UASTC_DICT_SIZE_DEFAULT = 4096; // 32768;
+	const int BASISU_RDO_UASTC_DICT_SIZE_MIN = 64;
 	const int BASISU_RDO_UASTC_DICT_SIZE_MAX = 65536;
 
 	struct image_stats
@@ -202,10 +202,13 @@
 			m_max_selector_clusters(512),
 			m_quality_level(-1),
 			m_pack_uastc_flags(cPackUASTCLevelDefault),
-			m_rdo_uastc_quality_scalar(1.0f, 0.001f, 10.0f),
+			m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f),
 			m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX),
+			m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f),
+			m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f),
 			m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f),
 			m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f),
+			m_resample_factor(0.0f, .00125f, 100.0f),
 			m_pJob_pool(nullptr)
 		{
 			clear();
@@ -262,6 +265,7 @@
 			m_mip_premultiplied.clear();
 			m_mip_renormalize.clear();
 			m_mip_wrapping.clear();
+			m_mip_fast.clear();
 			m_mip_smallest_dimension.clear();
 
 			m_max_endpoint_clusters = 0;
@@ -276,8 +280,14 @@
 			m_pack_uastc_flags = cPackUASTCLevelDefault;
 			m_rdo_uastc.clear();
 			m_rdo_uastc_quality_scalar.clear();
+			m_rdo_uastc_max_smooth_block_error_scale.clear();
+			m_rdo_uastc_smooth_block_max_std_dev.clear();
 			m_rdo_uastc_max_allowed_rms_increase_ratio.clear();
 			m_rdo_uastc_skip_block_rms_thresh.clear();
+			m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear();
+			m_rdo_uastc_multithreading.clear();
+
+			m_resample_factor.clear();
 
 			m_pJob_pool = nullptr;
 		}
@@ -290,10 +300,10 @@
 
 		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read. 
 		// Otherwise, the compressor processes the images in m_source_images.
-		std::vector<std::string> m_source_filenames;
-		std::vector<std::string> m_source_alpha_filenames;
+		basisu::vector<std::string> m_source_filenames;
+		basisu::vector<std::string> m_source_alpha_filenames;
 		
-		std::vector<image> m_source_images;
+		basisu::vector<image> m_source_images;
 		// TODO: Allow caller to supply their own mipmaps
 						
 		// Filename of the output basis file
@@ -368,6 +378,7 @@
 		bool_param<true> m_mip_premultiplied; // not currently supported
 		bool_param<false> m_mip_renormalize; 
 		bool_param<true> m_mip_wrapping;
+		bool_param<true> m_mip_fast;
 		param<int> m_mip_smallest_dimension;
 				
 		// Codebook size (quality) control. 
@@ -388,8 +399,14 @@
 		bool_param<false> m_rdo_uastc;
 		param<float> m_rdo_uastc_quality_scalar;
 		param<int> m_rdo_uastc_dict_size;
+		param<float> m_rdo_uastc_max_smooth_block_error_scale;
+		param<float> m_rdo_uastc_smooth_block_max_std_dev;
 		param<float> m_rdo_uastc_max_allowed_rms_increase_ratio;
 		param<float> m_rdo_uastc_skip_block_rms_thresh;
+		bool_param<true> m_rdo_uastc_favor_simpler_modes_in_rdo_mode;
+		bool_param<true> m_rdo_uastc_multithreading;
+
+		param<float> m_resample_factor;
 
 		job_pool *m_pJob_pool;
 	};
@@ -421,23 +438,23 @@
 
 		const uint8_vec &get_output_basis_file() const { return m_output_basis_file; }
 		
-		const std::vector<image_stats> &get_stats() const { return m_stats; }
+		const basisu::vector<image_stats> &get_stats() const { return m_stats; }
 
 		uint32_t get_basis_file_size() const { return m_basis_file_size; }
 		double get_basis_bits_per_texel() const { return m_basis_bits_per_texel; }
-
+		
 		bool get_any_source_image_has_alpha() const { return m_any_source_image_has_alpha; }
 				
 	private:
 		basis_compressor_params m_params;
 		
-		std::vector<image> m_slice_images;
+		basisu::vector<image> m_slice_images;
 
-		std::vector<image_stats> m_stats;
+		basisu::vector<image_stats> m_stats;
 
 		uint32_t m_basis_file_size;
 		double m_basis_bits_per_texel;
-		
+						
 		basisu_backend_slice_desc_vec m_slice_descs;
 
 		uint32_t m_total_blocks;
@@ -446,23 +463,23 @@
 		basisu_frontend m_frontend;
 		pixel_block_vec m_source_blocks;
 
-		std::vector<gpu_image> m_frontend_output_textures;
+		basisu::vector<gpu_image> m_frontend_output_textures;
 
-		std::vector<gpu_image> m_best_etc1s_images;
-		std::vector<image> m_best_etc1s_images_unpacked;
+		basisu::vector<gpu_image> m_best_etc1s_images;
+		basisu::vector<image> m_best_etc1s_images_unpacked;
 
 		basisu_backend m_backend;
 
 		basisu_file m_basis_file;
 
-		std::vector<gpu_image> m_decoded_output_textures;
-		std::vector<image> m_decoded_output_textures_unpacked;
-		std::vector<gpu_image> m_decoded_output_textures_bc7;
-		std::vector<image> m_decoded_output_textures_unpacked_bc7;
+		basisu::vector<gpu_image> m_decoded_output_textures;
+		basisu::vector<image> m_decoded_output_textures_unpacked;
+		basisu::vector<gpu_image> m_decoded_output_textures_bc7;
+		basisu::vector<image> m_decoded_output_textures_unpacked_bc7;
 
 		uint8_vec m_output_basis_file;
 		
-		std::vector<gpu_image> m_uastc_slice_textures;
+		basisu::vector<gpu_image> m_uastc_slice_textures;
 		basisu_backend_output m_uastc_backend_output;
 
 		bool m_any_source_image_has_alpha;
@@ -475,7 +492,7 @@
 		bool create_basis_file_and_transcode();
 		bool write_output_files_and_compute_stats();
 		error_code encode_slices_to_uastc();
-		bool generate_mipmaps(const image &img, std::vector<image> &mips, bool has_alpha);
+		bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha);
 		bool validate_texture_type_constraints();
 	};
 
diff --git a/encoder/basisu_enc.cpp b/encoder/basisu_enc.cpp
index 87b69dc..d046888 100644
--- a/encoder/basisu_enc.cpp
+++ b/encoder/basisu_enc.cpp
@@ -1,5 +1,5 @@
 // basisu_enc.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include "basisu_bc7enc.h"
 #include "apg_bmp.h"
 #include "jpgd.h"
+#include <vector>
 
 #if defined(_WIN32)
 // For QueryPerformanceCounter/QueryPerformanceFrequency
@@ -33,6 +34,9 @@
 {
 	uint64_t interval_timer::g_init_ticks, interval_timer::g_freq;
 	double interval_timer::g_timer_freq;
+#if BASISU_SUPPORT_SSE
+	bool g_cpu_supports_sse41;
+#endif
 
 	uint8_t g_hamming_dist[256] =
 	{
@@ -53,10 +57,12 @@
 		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 		4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 	};
-
+			
 	// Encoder library initialization (just call once at startup)
 	void basisu_encoder_init()
 	{
+		detect_sse41();
+
 		basist::basisu_transcoder_init();
 		pack_etc1_solid_color_init();
 		//uastc_init();
@@ -400,6 +406,8 @@
 			if ((!has_alpha) || ((image_save_flags & cImageSaveIgnoreAlpha) != 0))
 			{
 				const uint64_t total_bytes = (uint64_t)img.get_width() * 3U * (uint64_t)img.get_height();
+				if (total_bytes > INT_MAX)
+					return false;
 				uint8_vec rgb_pixels(static_cast<size_t>(total_bytes));
 				uint8_t *pDst = &rgb_pixels[0];
 								
@@ -464,7 +472,11 @@
 			}
 		}
 
-		data.resize((size_t)filesize);
+		if (!data.try_resize((size_t)filesize))
+		{
+			fclose(pFile);
+			return false;
+		}
 
 		if (filesize)
 		{
diff --git a/encoder/basisu_enc.h b/encoder/basisu_enc.h
index 58719ad..aadc861 100644
--- a/encoder/basisu_enc.h
+++ b/encoder/basisu_enc.h
@@ -1,5 +1,5 @@
 // basisu_enc.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,6 +30,9 @@
 
 // This module is really just a huge grab bag of classes and helper functions needed by the encoder.
 
+// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.
+#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)
+
 namespace basisu
 {
 	extern uint8_t g_hamming_dist[256];
@@ -38,6 +41,15 @@
 	// This function MUST be called before encoding anything!
 	void basisu_encoder_init();
 
+	// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
+	extern void detect_sse41();
+
+#if BASISU_SUPPORT_SSE
+	extern bool g_cpu_supports_sse41;
+#else
+	const bool g_cpu_supports_sse41 = false;
+#endif
+
 	void error_printf(const char *pFmt, ...);
 
 	// Helpers
@@ -61,6 +73,7 @@
 		v = v * a + 128; 
 		return (uint8_t)((v + (v >> 8)) >> 8);
 	}
+
 	inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
 	{
 		assert(codesize <= 64);
@@ -152,6 +165,7 @@
 			return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k));
 		}
 	};
+
 	class running_stat
 	{
 	public:
@@ -509,7 +523,7 @@
 		
 	private:
 		std::vector<std::thread> m_threads;
-        std::vector<std::function<void()> > m_queue;
+		std::vector<std::function<void()> > m_queue;
 		
 		std::mutex m_mutex;
 		std::condition_variable m_has_work;
@@ -731,7 +745,7 @@
 		static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
 	};
 
-	typedef std::vector<color_rgba> color_rgba_vec;
+	typedef basisu::vector<color_rgba> color_rgba_vec;
 
 	const color_rgba g_black_color(0, 0, 0, 255);
 	const color_rgba g_white_color(255, 255, 255, 255);
@@ -761,6 +775,7 @@
 	{
 		if (perceptual)
 		{
+#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE
 			const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;
 			const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;
 
@@ -783,6 +798,49 @@
 			}
 
 			return d;
+#elif 1
+			int dr = e1.r - e2.r;
+			int dg = e1.g - e2.g;
+			int db = e1.b - e2.b;
+			// Fixed-point deltas scaled by 128 (luma weights 27 + 92 + 9 == 128); chroma deltas are taken relative to luma.
+			int delta_l = dr * 27 + dg * 92 + db * 9;
+			int delta_cr = dr * 128 - delta_l;
+			int delta_cb = db * 128 - delta_l;
+			// Square in uint32_t: with 8-bit channels (assumed - confirm against color_rgba) the chroma squares can exceed INT_MAX, which is UB in signed int.
+			uint32_t id = (((uint32_t)delta_l * (uint32_t)delta_l) >> 7U) +
+				(((((uint32_t)delta_cr * (uint32_t)delta_cr) >> 7U) * 26U) >> 7U) +
+				(((((uint32_t)delta_cb * (uint32_t)delta_cb) >> 7U) * 3U) >> 7U);
+
+			if (alpha)
+			{
+				int da = (e1.a - e2.a) * 128; // multiply rather than << 7: left-shifting a negative int is UB before C++20
+				id += ((uint32_t)(da * da) >> 7U);
+			}
+
+			return id;
+#else
+			int dr = e1.r - e2.r;
+			int dg = e1.g - e2.g;
+			int db = e1.b - e2.b;
+
+			int64_t delta_l = dr * 27 + dg * 92 + db * 9;
+			int64_t delta_cr = dr * 128 - delta_l;
+			int64_t delta_cb = db * 128 - delta_l;
+
+			int64_t id = ((delta_l * delta_l) * 128) +
+				((delta_cr * delta_cr) * 26) +
+				((delta_cb * delta_cb) * 3);
+
+			if (alpha)
+			{
+				int64_t da = (e1.a - e2.a);
+				id += (da * da) * 128;
+			}
+
+			int d = (id + 8192) >> 14;
+
+			return d;
+#endif
 		}
 		else
 			return color_distance(e1, e2, alpha);
@@ -1102,7 +1160,7 @@
 			float m_priority;
 		};
 
-		std::vector<entry> m_heap;
+		basisu::vector<entry> m_heap;
 		uint32_t m_size;
 
 		// Push down entry at index
@@ -1134,7 +1192,7 @@
 	public:
 		typedef TrainingVectorType training_vec_type;
 		typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
-		typedef std::vector< training_vec_with_weight > array_of_weighted_training_vecs;
+		typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;
 
 		tree_vector_quant() :
 			m_next_codebook_index(0)
@@ -1154,7 +1212,7 @@
 		const array_of_weighted_training_vecs &get_training_vecs() const	{ return m_training_vecs; }
 				array_of_weighted_training_vecs &get_training_vecs()			{ return m_training_vecs; }
 
-		void retrieve(std::vector< std::vector<uint32_t> > &codebook) const
+		void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
 		{
 			for (uint32_t i = 0; i < m_nodes.size(); i++)
 			{
@@ -1167,7 +1225,7 @@
 			}
 		}
 
-		void retrieve(std::vector<TrainingVectorType> &codebook) const
+		void retrieve(basisu::vector<TrainingVectorType> &codebook) const
 		{
 			for (uint32_t i = 0; i < m_nodes.size(); i++)
 			{
@@ -1180,7 +1238,7 @@
 			}
 		}
 
-		void retrieve(uint32_t max_clusters, std::vector<uint_vec> &codebook) const
+		void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
       {
 			uint_vec node_stack;
          node_stack.reserve(512);
@@ -1227,7 +1285,7 @@
 			priority_queue var_heap;
 			var_heap.init(max_size, 0, m_nodes[0].m_var);
 
-			std::vector<uint32_t> l_children, r_children;
+			basisu::vector<uint32_t> l_children, r_children;
 
 			// Now split the worst nodes
 			l_children.reserve(m_training_vecs.size() + 1);
@@ -1265,7 +1323,7 @@
 			inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }
 
 			// vecs is erased
-			inline void set(const TrainingVectorType &org, uint64_t weight, float var, std::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }
+			inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }
 
 			inline bool is_leaf() const { return m_left_index < 0; }
 
@@ -1273,11 +1331,11 @@
 			uint64_t m_weight;
 			TrainingVectorType m_origin;
 			int32_t m_left_index, m_right_index;
-			std::vector<uint32_t> m_training_vecs;
+			basisu::vector<uint32_t> m_training_vecs;
 			int m_codebook_index;
 		};
 
-		typedef std::vector<tsvq_node> tsvq_node_vec;
+		typedef basisu::vector<tsvq_node> tsvq_node_vec;
 		tsvq_node_vec m_nodes;
 
 		array_of_weighted_training_vecs m_training_vecs;
@@ -1312,7 +1370,7 @@
 			return root;
 		}
 
-		bool split_node(uint32_t node_index, priority_queue &var_heap, std::vector<uint32_t> &l_children, std::vector<uint32_t> &r_children)
+		bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)
 		{
 			TrainingVectorType l_child_org, r_child_org;
 			uint64_t l_weight = 0, r_weight = 0;
@@ -1477,7 +1535,7 @@
 				if (largest_axis_index < 0)
 					return false;
 
-				std::vector<float> keys(node.m_training_vecs.size());
+				basisu::vector<float> keys(node.m_training_vecs.size());
 				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
 					keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];
 
@@ -1525,8 +1583,8 @@
 		}
 
 		bool refine_split(const tsvq_node &node,
-			TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, std::vector<uint32_t> &l_children,
-			TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, std::vector<uint32_t> &r_children) const
+			TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,
+			TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const
 		{
 			l_children.reserve(node.m_training_vecs.size());
 			r_children.reserve(node.m_training_vecs.size());
@@ -1639,8 +1697,8 @@
 	template<typename Quantizer>
 	bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,
 		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
-		std::vector<uint_vec>& codebook,
-		std::vector<uint_vec>& parent_codebook,
+		basisu::vector<uint_vec>& codebook,
+		basisu::vector<uint_vec>& parent_codebook,
 		uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)
 	{
 		codebook.resize(0);
@@ -1666,7 +1724,7 @@
 		if (!q.generate(max_threads))
 			return false;
 
-		std::vector<uint_vec> initial_codebook;
+		basisu::vector<uint_vec> initial_codebook;
 
 		q.retrieve(initial_codebook);
 
@@ -1685,8 +1743,8 @@
 		bool success_flags[cMaxThreads];
 		clear_obj(success_flags);
 
-		std::vector<uint_vec> local_clusters[cMaxThreads];
-		std::vector<uint_vec> local_parent_clusters[cMaxThreads];
+		basisu::vector<uint_vec> local_clusters[cMaxThreads];
+		basisu::vector<uint_vec> local_parent_clusters[cMaxThreads];
 
 		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
 		{
@@ -1777,8 +1835,8 @@
 	template<typename Quantizer>
 	bool generate_hierarchical_codebook_threaded(Quantizer& q,
 		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
-		std::vector<uint_vec>& codebook,
-		std::vector<uint_vec>& parent_codebook,
+		basisu::vector<uint_vec>& codebook,
+		basisu::vector<uint_vec>& parent_codebook,
 		uint32_t max_threads, job_pool *pJob_pool)
 	{
 		typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;
@@ -1808,7 +1866,7 @@
 
 		Quantizer group_quant;
 		typedef typename group_hash::const_iterator group_hash_const_iter;
-		std::vector<group_hash_const_iter> unique_vec_iters;
+		basisu::vector<group_hash_const_iter> unique_vec_iters;
 		unique_vec_iters.reserve(unique_vecs.size());
 
 		for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)
@@ -1823,7 +1881,7 @@
 
 		debug_printf("Limit clusterizers: %u\n", limit_clusterizers);
 
-		std::vector<uint_vec> group_codebook, group_parent_codebook;
+		basisu::vector<uint_vec> group_codebook, group_parent_codebook;
 		bool status = generate_hierarchical_codebook_threaded_internal(group_quant,
 			max_codebook_size, max_parent_codebook_size,
 			group_codebook,
@@ -1872,7 +1930,7 @@
 
 	class histogram
 	{
-		std::vector<uint32_t> m_hist;
+		basisu::vector<uint32_t> m_hist;
 
 	public:
 		histogram(uint32_t size = 0) { init(size); }
@@ -2611,7 +2669,7 @@
 
 	// Float images
 
-	typedef std::vector<vec4F> vec4F_vec;
+	typedef basisu::vector<vec4F> vec4F_vec;
 
 	class imagef
 	{
@@ -2941,7 +2999,7 @@
 	template<typename T>
 	class vector2D
 	{
-		typedef std::vector<T> TVec;
+		typedef basisu::vector<T> TVec;
 
 		uint32_t m_width, m_height;
 		TVec m_values;
diff --git a/encoder/basisu_etc.cpp b/encoder/basisu_etc.cpp
index 7d61c41..34cca98 100644
--- a/encoder/basisu_etc.cpp
+++ b/encoder/basisu_etc.cpp
@@ -1,5 +1,5 @@
 // basis_etc.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,6 +14,11 @@
 // limitations under the License.
 #include "basisu_etc.h"
 
+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
 #define BASISU_DEBUG_ETC_ENCODER 0
 #define BASISU_DEBUG_ETC_ENCODER_DEEPER 0
 
@@ -756,9 +761,11 @@
 	{
 		assert(m_pResult->m_pSelectors);
 
-		if ((m_pParams->m_pForce_selectors) || (m_pParams->m_pEval_solution_override))
+		if (m_pParams->m_pForce_selectors)
 		{
 			assert(m_pParams->m_quality >= cETCQualitySlow);
+			if (m_pParams->m_quality < cETCQualitySlow)
+				return false;
 		}
 
 		const uint32_t n = m_pParams->m_num_src_pixels;
@@ -768,7 +775,7 @@
 			if (m_pParams->m_quality == cETCQualityFast)
 				compute_internal_cluster_fit(4);
 			else if (m_pParams->m_quality == cETCQualityMedium)
-				compute_internal_cluster_fit(32);
+				compute_internal_cluster_fit(16);
 			else if (m_pParams->m_quality == cETCQualitySlow)
 				compute_internal_cluster_fit(64);
 			else
@@ -783,23 +790,27 @@
 			return false;
 		}
 
-		const uint8_t* pSelectors = &m_best_solution.m_selectors[0];
+		//const uint8_t* pSelectors = &m_best_solution.m_selectors[0];
+		const uint8_t* pSelectors = m_pParams->m_pForce_selectors ? m_pParams->m_pForce_selectors : &m_best_solution.m_selectors[0];
 
-#ifdef BASISU_BUILD_DEBUG
-		if (m_pParams->m_pEval_solution_override == nullptr)
+#if defined(DEBUG) || defined(_DEBUG)
 		{
+			// sanity check the returned error
 			color_rgba block_colors[4];
 			m_best_solution.m_coords.get_block_colors(block_colors);
 
 			const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels;
 			uint64_t actual_error = 0;
+			
+			bool perceptual;
+			if (m_pParams->m_quality >= cETCQualityMedium)
+				perceptual = m_pParams->m_perceptual;
+			else
+				perceptual = (m_pParams->m_quality == cETCQualityFast) ? false : m_pParams->m_perceptual;
+						
 			for (uint32_t i = 0; i < n; i++)
-			{
-				if ((m_pParams->m_perceptual) && (m_pParams->m_quality >= cETCQualitySlow))
-					actual_error += color_distance(true, pSrc_pixels[i], block_colors[pSelectors[i]], false);
-				else
-					actual_error += color_distance(pSrc_pixels[i], block_colors[pSelectors[i]], false);
-			}
+				actual_error += color_distance(perceptual, pSrc_pixels[i], block_colors[pSelectors[i]], false);
+
 			assert(actual_error == m_best_solution.m_error);
 		}
 #endif      
@@ -988,9 +999,21 @@
 		m_sorted_luma_indices.resize(n);
 		m_sorted_luma.resize(n);
 		
+		int min_r = 255, min_g = 255, min_b = 255;
+		int max_r = 0, max_g = 0, max_b = 0;
+		
 		for (uint32_t i = 0; i < n; i++)
 		{
 			const color_rgba& c = m_pParams->m_pSrc_pixels[i];
+
+			min_r = std::min<int>(min_r, c.r);
+			min_g = std::min<int>(min_g, c.g);
+			min_b = std::min<int>(min_b, c.b);
+
+			max_r = std::max<int>(max_r, c.r);
+			max_g = std::max<int>(max_g, c.g);
+			max_b = std::max<int>(max_b, c.b);
+
 			const vec3F fc(c.r, c.g, c.b);
 
 			avg_color += fc;
@@ -1000,7 +1023,8 @@
 		}
 		avg_color /= static_cast<float>(n);
 		m_avg_color = avg_color;
-
+		m_max_comp_spread = std::max(std::max(max_r - min_r, max_g - min_g), max_b - min_b);
+		
 		m_br = clamp<int>(static_cast<uint32_t>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit);
 		m_bg = clamp<int>(static_cast<uint32_t>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit);
 		m_bb = clamp<int>(static_cast<uint32_t>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit);
@@ -1009,7 +1033,7 @@
 		printf("Avg block color: %u %u %u\n", m_br, m_bg, m_bb);
 #endif
 
-		if (m_pParams->m_quality <= cETCQualityMedium)
+		if (m_pParams->m_quality == cETCQualityFast)
 		{
 			indirect_sort(n, &m_sorted_luma_indices[0], &m_luma[0]);
 
@@ -1024,13 +1048,45 @@
 		m_best_solution.m_valid = false;
 		m_best_solution.m_error = UINT64_MAX;
 
-		m_solutions_tried.clear();
+		clear_obj(m_solutions_tried);
 	}
 
+	// Returns false if this color has probably been tried already (both Bloom filter bits were set, so a false positive is possible); otherwise marks it as tried and returns true.
+	bool etc1_optimizer::check_for_redundant_solution(const etc1_solution_coordinates& coords)
+	{
+		// Hash first 3 bytes of color (RGB) - assumes r, g, b are stored contiguously starting at r (TODO confirm)
+		uint32_t kh = hash_hsieh((uint8_t*)&coords.m_unscaled_color.r, 3);
+
+		uint32_t h0 = kh & cSolutionsTriedHashMask; // first bit index: low cSolutionsTriedHashBits bits of the hash
+		uint32_t h1 = (kh >> cSolutionsTriedHashBits) & cSolutionsTriedHashMask; // second bit index: the next cSolutionsTriedHashBits bits of the hash
+
+		// Simple Bloom filter lookup with k=2: only when both bits are already set is the solution treated as previously tried
+		if ( ((m_solutions_tried[h0 >> 3] & (1 << (h0 & 7))) != 0) &&
+		     ((m_solutions_tried[h1 >> 3] & (1 << (h1 & 7))) != 0) )
+			return false;
+
+		m_solutions_tried[h0 >> 3] |= (1 << (h0 & 7)); // record as tried: bit i lives in byte (i >> 3) at bit position (i & 7)
+		m_solutions_tried[h1 >> 3] |= (1 << (h1 & 7));
+
+		return true;
+	}
+		
+	static uint8_t g_eval_dist_tables[8][256] =
+	{
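+		// 99% threshold. Indexed as [inten_table][m_max_comp_spread]; a 0 entry makes evaluate_solution_slow() skip that intensity table when m_quality <= cETCQualityMedium.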
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,},
+		{ 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,},
+		{ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,},
+		{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,}
+	};
+
 	bool etc1_optimizer::evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
 	{
-		uint32_t k = coords.m_unscaled_color.r | (coords.m_unscaled_color.g << 8) | (coords.m_unscaled_color.b << 16);
-		if (!m_solutions_tried.insert(k).second)
+		if (!check_for_redundant_solution(coords))
 			return false;
 
 #if BASISU_DEBUG_ETC_ENCODER_DEEPER
@@ -1059,12 +1115,39 @@
 		const uint32_t n = m_pParams->m_num_src_pixels;
 		assert(trial_solution.m_selectors.size() == n);
 
-		trial_solution.m_error = UINT64_MAX;
+		trial_solution.m_error = INT64_MAX;
 
 		const uint8_t *pSelectors_to_use = m_pParams->m_pForce_selectors;
 
 		for (uint32_t inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++)
 		{
+			if (m_pParams->m_quality <= cETCQualityMedium)
+			{
+				if (!g_eval_dist_tables[inten_table][m_max_comp_spread])
+					continue;
+			}
+#if 0
+			if (m_pParams->m_quality <= cETCQualityMedium)
+			{
+				// For tables 5-7, if the max component spread falls within certain ranges, skip the inten table. Statistically they are extremely unlikely to result in lower error.
+				if (inten_table == 7)
+				{
+					if (m_max_comp_spread < 42)
+						continue;
+				}
+				else if (inten_table == 6)
+				{
+					if ((m_max_comp_spread >= 12) && (m_max_comp_spread <= 31))
+						continue;
+				}
+				else if (inten_table == 5)
+				{
+					if ((m_max_comp_spread >= 13) && (m_max_comp_spread <= 21))
+						continue;
+				}
+			}
+#endif
+
 			const int* pInten_table = g_etc1_inten_tables[inten_table];
 
 			color_rgba block_colors[4];
@@ -1077,55 +1160,72 @@
 			uint64_t total_error = 0;
 
 			const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels;
-			for (uint32_t c = 0; c < n; c++)
+
+			if (!g_cpu_supports_sse41)
 			{
-				const color_rgba& src_pixel = *pSrc_pixels++;
+				for (uint32_t c = 0; c < n; c++)
+				{
+					const color_rgba& src_pixel = *pSrc_pixels++;
 
-				uint32_t best_selector_index = 0;
-				uint32_t best_error = 0;
+					uint32_t best_selector_index = 0;
+					uint32_t best_error = 0;
 
+					if (pSelectors_to_use)
+					{
+						best_selector_index = pSelectors_to_use[c];
+						best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[best_selector_index], false);
+					}
+					else
+					{
+						best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[0], false);
+
+						uint32_t trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[1], false);
+						if (trial_error < best_error)
+						{
+							best_error = trial_error;
+							best_selector_index = 1;
+						}
+
+						trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[2], false);
+						if (trial_error < best_error)
+						{
+							best_error = trial_error;
+							best_selector_index = 2;
+						}
+
+						trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[3], false);
+						if (trial_error < best_error)
+						{
+							best_error = trial_error;
+							best_selector_index = 3;
+						}
+					}
+
+					m_temp_selectors[c] = static_cast<uint8_t>(best_selector_index);
+
+					total_error += best_error;
+					if (total_error >= trial_solution.m_error)
+						break;
+				}
+			}
+			else
+			{
+#if BASISU_SUPPORT_SSE
 				if (pSelectors_to_use)
 				{
-					best_selector_index = pSelectors_to_use[c];
-					best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[best_selector_index], false);
+					if (m_pParams->m_perceptual)
+						perceptual_distance_rgb_4_N_sse41((int64_t*)&total_error, pSelectors_to_use, block_colors, pSrc_pixels, n, trial_solution.m_error);
+					else
+						linear_distance_rgb_4_N_sse41((int64_t*)&total_error, pSelectors_to_use, block_colors, pSrc_pixels, n, trial_solution.m_error);
 				}
 				else
 				{
-					best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[0], false);
-
-					uint32_t trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[1], false);
-					if (trial_error < best_error)
-					{
-						best_error = trial_error;
-						best_selector_index = 1;
-					}
-
-					trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[2], false);
-					if (trial_error < best_error)
-					{
-						best_error = trial_error;
-						best_selector_index = 2;
-					}
-
-					trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[3], false);
-					if (trial_error < best_error)
-					{
-						best_error = trial_error;
-						best_selector_index = 3;
-					}
+					if (m_pParams->m_perceptual)
+						find_selectors_perceptual_rgb_4_N_sse41((int64_t*)&total_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, trial_solution.m_error);
+					else
+						find_selectors_linear_rgb_4_N_sse41((int64_t*)&total_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, trial_solution.m_error);
 				}
-
-				m_temp_selectors[c] = static_cast<uint8_t>(best_selector_index);
-
-				total_error += best_error;
-				if ((m_pParams->m_pEval_solution_override == nullptr) && (total_error >= trial_solution.m_error))
-					break;
-			}
-
-			if (m_pParams->m_pEval_solution_override)
-			{
-				if (!(*m_pParams->m_pEval_solution_override)(total_error, *m_pParams, block_colors, &m_temp_selectors[0], coords))
-					return false;
+#endif
 			}
 
 			if (total_error < trial_solution.m_error)
@@ -1138,7 +1238,7 @@
 		}
 		trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
 		trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
-
+				
 #if BASISU_DEBUG_ETC_ENCODER_DEEPER
 		printf("Eval done: %u error: %I64u best error so far: %I64u\n", (trial_solution.m_error < pBest_solution->m_error), trial_solution.m_error, pBest_solution->m_error);
 #endif
@@ -1152,14 +1252,13 @@
 				success = true;
 			}
 		}
-
+				
 		return success;
 	}
 
 	bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
 	{
-		uint32_t k = coords.m_unscaled_color.r | (coords.m_unscaled_color.g << 8) | (coords.m_unscaled_color.b << 16);
-		if (!m_solutions_tried.insert(k).second)
+		if (!check_for_redundant_solution(coords))
 			return false;
 
 #if BASISU_DEBUG_ETC_ENCODER_DEEPER
@@ -1184,12 +1283,14 @@
 		}
 
 		const color_rgba base_color(coords.get_scaled_color());
-
+		
 		const uint32_t n = m_pParams->m_num_src_pixels;
 		assert(trial_solution.m_selectors.size() == n);
 
 		trial_solution.m_error = UINT64_MAX;
-
+								
+		const bool perceptual = (m_pParams->m_quality == cETCQualityFast) ? false : m_pParams->m_perceptual;
+				
 		for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table)
 		{
 			const int* pInten_table = g_etc1_inten_tables[inten_table];
@@ -1209,57 +1310,147 @@
 			// 0   1   2   3
 			//   01  12  23
 			const uint32_t block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] };
-
+															
 			uint64_t total_error = 0;
 			const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels;
-			if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
+						
+			if (perceptual)
 			{
-				if (block_inten[0] > m_pSorted_luma[n - 1])
+				if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
 				{
-					const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]);
-					if (min_error >= trial_solution.m_error)
-						continue;
+					if (block_inten[0] > m_pSorted_luma[n - 1])
+					{
+						const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 0, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(true, block_colors[0], pSrc_pixels[c], false);
 				}
-
-				memset(&m_temp_selectors[0], 0, n);
-
-				for (uint32_t c = 0; c < n; c++)
-					total_error += color_distance(block_colors[0], pSrc_pixels[c], false);
-			}
-			else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
-			{
-				if (m_pSorted_luma[0] > block_inten[3])
+				else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
 				{
-					const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]);
-					if (min_error >= trial_solution.m_error)
-						continue;
+					if (m_pSorted_luma[0] > block_inten[3])
+					{
+						const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 3, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(true, block_colors[3], pSrc_pixels[c], false);
 				}
+				else
+				{
+					if (!g_cpu_supports_sse41)
+					{
+						uint32_t cur_selector = 0, c;
+						for (c = 0; c < n; c++)
+						{
+							const uint32_t y = m_pSorted_luma[c];
+							while ((y * 2) >= block_inten_midpoints[cur_selector])
+								if (++cur_selector > 2)
+									goto done;
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
+							total_error += color_distance(true, block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false);
+						}
+					done:
+						while (c < n)
+						{
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = 3;
+							total_error += color_distance(true, block_colors[3], pSrc_pixels[sorted_pixel_index], false);
+							++c;
+						}
+					}
+					else
+					{
+#if BASISU_SUPPORT_SSE
+						uint32_t cur_selector = 0, c;
 
-				memset(&m_temp_selectors[0], 3, n);
+						for (c = 0; c < n; c++)
+						{
+							const uint32_t y = m_pSorted_luma[c];
+							while ((y * 2) >= block_inten_midpoints[cur_selector])
+							{
+								if (++cur_selector > 2)
+									goto done3;
+							}
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
+						}
+					done3:
 
-				for (uint32_t c = 0; c < n; c++)
-					total_error += color_distance(block_colors[3], pSrc_pixels[c], false);
+						while (c < n)
+						{
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = 3;
+							++c;
+						}
+
+						int64_t block_error;
+						perceptual_distance_rgb_4_N_sse41(&block_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, INT64_MAX);
+						total_error += block_error;
+#endif
+					}
+				}
 			}
 			else
 			{
-				uint32_t cur_selector = 0, c;
-				for (c = 0; c < n; c++)
+				if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
 				{
-					const uint32_t y = m_pSorted_luma[c];
-					while ((y * 2) >= block_inten_midpoints[cur_selector])
-						if (++cur_selector > 2)
-							goto done;
-					const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
-					m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
-					total_error += color_distance(block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false);
+					if (block_inten[0] > m_pSorted_luma[n - 1])
+					{
+						const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 0, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(block_colors[0], pSrc_pixels[c], false);
 				}
-			done:
-				while (c < n)
+				else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
 				{
-					const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
-					m_temp_selectors[sorted_pixel_index] = 3;
-					total_error += color_distance(block_colors[3], pSrc_pixels[sorted_pixel_index], false);
-					++c;
+					if (m_pSorted_luma[0] > block_inten[3])
+					{
+						const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 3, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(block_colors[3], pSrc_pixels[c], false);
+				}
+				else
+				{
+					uint32_t cur_selector = 0, c;
+					for (c = 0; c < n; c++)
+					{
+						const uint32_t y = m_pSorted_luma[c];
+						while ((y * 2) >= block_inten_midpoints[cur_selector])
+							if (++cur_selector > 2)
+								goto done2;
+						const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+						m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
+						total_error += color_distance(block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false);
+					}
+				done2:
+					while (c < n)
+					{
+						const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+						m_temp_selectors[sorted_pixel_index] = 3;
+						total_error += color_distance(block_colors[3], pSrc_pixels[sorted_pixel_index], false);
+						++c;
+					}
 				}
 			}
 
diff --git a/encoder/basisu_etc.h b/encoder/basisu_etc.h
index 0c8a7be..1e3ece4 100644
--- a/encoder/basisu_etc.h
+++ b/encoder/basisu_etc.h
@@ -1,5 +1,5 @@
 // basis_etc.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
 #pragma once
 #include "../transcoder/basisu.h"
 #include "basisu_enc.h"
-#include <set>
 
 namespace basisu
 {
@@ -758,7 +757,7 @@
 		}
 	};
 		
-	typedef std::vector<etc_block> etc_block_vec;
+	typedef basisu::vector<etc_block> etc_block_vec;
 
 	// Returns false if the unpack fails (could be bogus data or ETC2)
 	bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha = false);
@@ -882,10 +881,10 @@
 				bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
 			}
 			const int* pInten_table = g_etc1_inten_tables[m_inten_table];
-			pBlock_colors[0].set((uint8_t)(br + pInten_table[0]), (uint8_t)(bg + pInten_table[0]), (uint8_t)(bb + pInten_table[0]), 255);
-			pBlock_colors[1].set((uint8_t)(br + pInten_table[1]), (uint8_t)(bg + pInten_table[1]), (uint8_t)(bb + pInten_table[1]), 255);
-			pBlock_colors[2].set((uint8_t)(br + pInten_table[2]), (uint8_t)(bg + pInten_table[2]), (uint8_t)(bb + pInten_table[2]), 255);
-			pBlock_colors[3].set((uint8_t)(br + pInten_table[3]), (uint8_t)(bg + pInten_table[3]), (uint8_t)(bb + pInten_table[3]), 255);
+			pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0], 255);
+			pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1], 255);
+			pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2], 255);
+			pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3], 255);
 		}
 
 		color_rgba m_unscaled_color;
@@ -952,9 +951,6 @@
 				m_refinement = true;
 
 				m_pForce_selectors = nullptr;
-
-				m_pEval_solution_override = nullptr;
-				m_pEval_solution_override_data = nullptr;
 			}
 
 			uint32_t m_num_src_pixels;
@@ -970,9 +966,6 @@
 			bool m_refinement;
 
 			const uint8_t* m_pForce_selectors;
-
-			evaluate_solution_override_func m_pEval_solution_override;
-			void *m_pEval_solution_override_data;
 		};
 
 		struct results
@@ -1008,7 +1001,7 @@
 			}
 
 			etc1_solution_coordinates  m_coords;
-			std::vector<uint8_t>    m_selectors;
+			basisu::vector<uint8_t>    m_selectors;
 			uint64_t                     m_error;
 			bool                       m_valid;
 
@@ -1039,33 +1032,36 @@
 
 		vec3F m_avg_color;
 		int m_br, m_bg, m_bb;
-		std::vector<uint16_t> m_luma;
-		std::vector<uint32_t> m_sorted_luma;
-		std::vector<uint32_t> m_sorted_luma_indices;
+		int m_max_comp_spread;
+		basisu::vector<uint16_t> m_luma;
+		basisu::vector<uint32_t> m_sorted_luma;
+		basisu::vector<uint32_t> m_sorted_luma_indices;
 		const uint32_t* m_pSorted_luma_indices;
 		uint32_t* m_pSorted_luma;
 
-		std::vector<uint8_t> m_selectors;
-		std::vector<uint8_t> m_best_selectors;
+		basisu::vector<uint8_t> m_selectors;
+		basisu::vector<uint8_t> m_best_selectors;
 
 		potential_solution m_best_solution;
 		potential_solution m_trial_solution;
-		std::vector<uint8_t> m_temp_selectors;
+		basisu::vector<uint8_t> m_temp_selectors;
 
-		std::set<uint32_t> m_solutions_tried;
-
+		enum { cSolutionsTriedHashBits = 10, cTotalSolutionsTriedHashSize = 1 << cSolutionsTriedHashBits, cSolutionsTriedHashMask = cTotalSolutionsTriedHashSize - 1 };
+		uint8_t m_solutions_tried[cTotalSolutionsTriedHashSize / 8];
+		
 		void get_nearby_inten_tables(uint32_t idx, int &first_inten_table, int &last_inten_table)
 		{
 			first_inten_table = maximum<int>(idx - 1, 0);
 			last_inten_table = minimum<int>(cETC1IntenModifierValues, idx + 1);
 		}
-
+		
+		bool check_for_redundant_solution(const etc1_solution_coordinates& coords);
 		bool evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
 		bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
 
 		inline bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
 		{
-			if (m_pParams->m_quality >= cETCQualitySlow)
+			if (m_pParams->m_quality >= cETCQualityMedium)
 				return evaluate_solution_slow(coords, trial_solution, pBest_solution);
 			else
 				return evaluate_solution_fast(coords, trial_solution, pBest_solution);
diff --git a/encoder/basisu_frontend.cpp b/encoder/basisu_frontend.cpp
index 35ad68f..0039db0 100644
--- a/encoder/basisu_frontend.cpp
+++ b/encoder/basisu_frontend.cpp
@@ -1,5 +1,5 @@
 // basisu_frontend.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 #include <unordered_set>
 #include <unordered_map>
 
+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
 #define BASISU_FRONTEND_VERIFY(c) do { if (!(c)) handle_verify_failure(__LINE__); } while(0)
 
 namespace basisu
@@ -32,7 +37,8 @@
 	//const uint32_t BASISU_MAX_SELECTOR_REFINEMENT_STEPS = 3;
 
 	const uint32_t BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE = 16;
-	const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE = 16;
+	const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 = 32;
+	const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT = 16;
 	
 	// TODO - How to handle internal verifies in the basisu lib
 	static inline void handle_verify_failure(int line)
@@ -57,14 +63,14 @@
 
 			uint32_t tv = size / sizeof(vec6F_quantizer::training_vec_with_weight);
 
-			std::vector<vec6F_quantizer::training_vec_with_weight> v(tv);
+			basisu::vector<vec6F_quantizer::training_vec_with_weight> v(tv);
 			fread(&v[0], 1, sizeof(v[0]) * tv, pFile);
 
 			for (uint32_t i = 0; i < tv; i++)
 				m_endpoint_clusterizer.add_training_vec(v[i].first, v[i].second);
 
 			m_endpoint_clusterizer.generate(16128);
-			std::vector<uint_vec> codebook;
+			basisu::vector<uint_vec> codebook;
 			m_endpoint_clusterizer.retrieve(codebook);
 
 			printf("Generated %u entries\n", (uint32_t)codebook.size());
@@ -129,11 +135,19 @@
 		case 2:
 		{
 			m_endpoint_refinement = true;
+			m_use_hierarchical_endpoint_codebooks = true;
+			m_use_hierarchical_selector_codebooks = true;
+
+			break;
+		}
+		case 3:
+		{
+			m_endpoint_refinement = true;
 			m_use_hierarchical_endpoint_codebooks = false;
 			m_use_hierarchical_selector_codebooks = false;
 			break;
 		}
-		case 3:
+		case 4:
 		{
 			m_endpoint_refinement = true;
 			m_use_hierarchical_endpoint_codebooks = true;
@@ -142,7 +156,7 @@
 			m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;
 			break;
 		}
-		case 4:
+		case 5:
 		{
 			m_endpoint_refinement = true;
 			m_use_hierarchical_endpoint_codebooks = false;
@@ -151,7 +165,8 @@
 			m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;
 			break;
 		}
-		case 5:
+		case 6:
+		default:
 		{
 			m_endpoint_refinement = true;
 			m_use_hierarchical_endpoint_codebooks = false;
@@ -269,7 +284,7 @@
 
 				introduce_special_selector_clusters();
 				
-				if ((m_params.m_compression_level >= 3) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
+				if ((m_params.m_compression_level >= 4) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
 				{
 					if (!refine_block_endpoints_given_selectors())
 						break;
@@ -443,9 +458,9 @@
 			m_block_selector_cluster_index[i] = old_to_new[m_block_selector_cluster_index[i]];
 		}
 
-		std::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? total_new_entries : 0);
+		basisu::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? total_new_entries : 0);
 		basist::etc1_global_selector_codebook_entry_id_vec new_optimized_cluster_selector_global_cb_ids(m_optimized_cluster_selector_global_cb_ids.size() ? total_new_entries : 0);
-		std::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_indices.size() ? total_new_entries : 0);
+		basisu::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_indices.size() ? total_new_entries : 0);
 		bool_vec new_selector_cluster_uses_global_cb(m_selector_cluster_uses_global_cb.size() ? total_new_entries : 0);
 
 		for (uint32_t i = 0; i < total_new_entries; i++)
@@ -474,6 +489,9 @@
 	void basisu_frontend::init_etc1_images()
 	{
 		debug_printf("basisu_frontend::init_etc1_images\n");
+
+		interval_timer tm;
+		tm.start();
 				
 		m_etc1_blocks_etc1s.resize(m_total_blocks);
 
@@ -497,6 +515,8 @@
 			
 					if (m_params.m_compression_level == 0)
 						optimizer_params.m_quality = cETCQualityFast;
+					else if (m_params.m_compression_level == 1)
+						optimizer_params.m_quality = cETCQualityMedium;
 					else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
 						optimizer_params.m_quality = cETCQualityUber;
 						
@@ -509,7 +529,8 @@
 					optimizer_results.m_n = 16;
 
 					optimizer.init(optimizer_params, optimizer_results);
-					optimizer.compute();
+					if (!optimizer.compute())
+						BASISU_FRONTEND_VERIFY(false);
 
 					etc_block &blk = m_etc1_blocks_etc1s[block_index];
 
@@ -532,6 +553,8 @@
 #ifndef __EMSCRIPTEN__
 		m_params.m_pJob_pool->wait_for_all();
 #endif
+
+		debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
 	}
 
 	void basisu_frontend::init_endpoint_training_vectors()
@@ -663,7 +686,7 @@
 
 		for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
 		{
-			const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+			const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
 			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 			{
@@ -727,13 +750,13 @@
 
 				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
-					const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+					const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
 					assert(cluster_indices.size());
 
 					for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 					{
-						std::vector<color_rgba> cluster_pixels(8);
+						basisu::vector<color_rgba> cluster_pixels(8);
 
 						const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
 						const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
@@ -921,13 +944,13 @@
 
 				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
-					const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+					const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
 					BASISU_FRONTEND_VERIFY(cluster_indices.size());
 
 					const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;
 
-					std::vector<color_rgba> cluster_pixels(total_pixels);
+					basisu::vector<color_rgba> cluster_pixels(total_pixels);
 
 					for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 					{
@@ -958,20 +981,21 @@
 						cluster_optimizer_params.m_use_color4 = false;
 						cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
 
-						if (m_params.m_compression_level == 0)
+						if (m_params.m_compression_level <= 1)
 							cluster_optimizer_params.m_quality = cETCQualityMedium;
 						else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
 							cluster_optimizer_params.m_quality = cETCQualityUber;
 
 						etc1_optimizer::results cluster_optimizer_results;
 
-						std::vector<uint8_t> cluster_selectors(total_pixels);
+						basisu::vector<uint8_t> cluster_selectors(total_pixels);
 						cluster_optimizer_results.m_n = total_pixels;
 						cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
 
 						optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
 
-						optimizer.compute();
+						if (!optimizer.compute())
+							BASISU_FRONTEND_VERIFY(false);
 
 						new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
 						new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
@@ -1048,11 +1072,11 @@
 
 	bool basisu_frontend::check_etc1s_constraints() const
 	{
-		std::vector<vec2U> block_clusters(m_total_blocks);
+		basisu::vector<vec2U> block_clusters(m_total_blocks);
 
 		for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
 		{
-			const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+			const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
 			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 			{
@@ -1080,11 +1104,11 @@
 		if (m_use_hierarchical_endpoint_codebooks)
 			compute_endpoint_clusters_within_each_parent_cluster();
 
-		std::vector<vec2U> block_clusters(m_total_blocks);
+		basisu::vector<vec2U> block_clusters(m_total_blocks);
 
 		for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
 		{
-			const std::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+			const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
 
 			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 			{
@@ -1114,15 +1138,13 @@
 
 				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 				{
-					//const bool is_flipped = true;
-			
 					const uint32_t cluster_index = block_clusters[block_index][0];
 					BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);
 
-					const color_rgba *subblock_pixels = get_source_pixel_block(block_index).get_ptr();
+					const color_rgba *pSubblock_pixels = get_source_pixel_block(block_index).get_ptr();
 					const uint32_t num_subblock_pixels = 16;
 
-					uint64_t best_cluster_err = UINT64_MAX;
+					uint64_t best_cluster_err = INT64_MAX;
 					uint32_t best_cluster_index = 0;
 
 					const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;
@@ -1145,19 +1167,20 @@
 						// Can't assign it here - may result in too much error when selector quant occurs
 						if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])
 						{
-							total_err = UINT64_MAX;
+							total_err = INT64_MAX;
 							goto skip_cluster;
 						}
 
 						etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);
-
+												
+#if 0
 						for (uint32_t p = 0; p < num_subblock_pixels; p++)
 						{
 							uint64_t best_err = UINT64_MAX;
 
 							for (uint32_t r = low_selector; r <= high_selector; r++)
 							{
-								uint64_t err = color_distance(m_params.m_perceptual, subblock_pixels[p], subblock_colors[r], false);
+								uint64_t err = color_distance(m_params.m_perceptual, pSubblock_pixels[p], subblock_colors[r], false);
 								best_err = minimum(best_err, err);
 								if (!best_err)
 									break;
@@ -1167,6 +1190,64 @@
 							if (total_err > best_cluster_err)
 								break;
 						} // p
+#else
+						if (m_params.m_perceptual)
+						{
+							if (!g_cpu_supports_sse41)
+							{
+								for (uint32_t p = 0; p < num_subblock_pixels; p++)
+								{
+									uint64_t best_err = UINT64_MAX;
+
+									for (uint32_t r = low_selector; r <= high_selector; r++)
+									{
+										uint64_t err = color_distance(true, pSubblock_pixels[p], subblock_colors[r], false);
+										best_err = minimum(best_err, err);
+										if (!best_err)
+											break;
+									}
+
+									total_err += best_err;
+									if (total_err > best_cluster_err)
+										break;
+								} // p
+							}
+							else
+							{
+#if BASISU_SUPPORT_SSE
+								find_lowest_error_perceptual_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
+#endif
+							}
+						}
+						else
+						{
+							if (!g_cpu_supports_sse41)
+							{
+								for (uint32_t p = 0; p < num_subblock_pixels; p++)
+								{
+									uint64_t best_err = UINT64_MAX;
+
+									for (uint32_t r = low_selector; r <= high_selector; r++)
+									{
+										uint64_t err = color_distance(false, pSubblock_pixels[p], subblock_colors[r], false);
+										best_err = minimum(best_err, err);
+										if (!best_err)
+											break;
+									}
+
+									total_err += best_err;
+									if (total_err > best_cluster_err)
+										break;
+								} // p
+							}
+							else
+							{
+#if BASISU_SUPPORT_SSE
+								find_lowest_error_linear_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
+#endif
+							}
+						}
+#endif
 
 					skip_cluster:
 						if ((total_err < best_cluster_err) ||
@@ -1194,7 +1275,7 @@
 		m_params.m_pJob_pool->wait_for_all();
 #endif
 
-		std::vector<typename std::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size());
+		basisu::vector<typename basisu::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size());
 		uint32_t total_subblocks_reassigned = 0;
 
 		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
@@ -1232,8 +1313,8 @@
 
 		indirect_sort((uint32_t)m_endpoint_clusters.size(), &sorted_endpoint_cluster_indices[0], &m_endpoint_cluster_etc_params[0]);
 
-		std::vector<std::vector<uint32_t> > new_endpoint_clusters(m_endpoint_clusters.size());
-		std::vector<endpoint_cluster_etc_params> new_subblock_etc_params(m_endpoint_clusters.size());
+		basisu::vector<basisu::vector<uint32_t> > new_endpoint_clusters(m_endpoint_clusters.size());
+		basisu::vector<endpoint_cluster_etc_params> new_subblock_etc_params(m_endpoint_clusters.size());
 		
 		for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++)
 		{
@@ -1343,7 +1424,7 @@
 
 		for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_indices.size()); cluster_index++)
 		{
-			const std::vector<uint32_t>& cluster_indices = m_selector_cluster_indices[cluster_index];
+			const basisu::vector<uint32_t>& cluster_indices = m_selector_cluster_indices[cluster_index];
 
 			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 			{
@@ -1437,7 +1518,9 @@
 		for (uint32_t i = 0; i < m_total_blocks; i++)
 			selector_clusterizer.add_training_vec(training_vecs[i].first, training_vecs[i].second);
 
-		const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE : 0;
+		const int selector_parent_codebook_size = (m_params.m_compression_level <= 1) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 : BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT;
+		const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? selector_parent_codebook_size : 0;
+		debug_printf("Using selector parent codebook size %u\n", parent_codebook_size);
 
 		uint32_t max_threads = 0;
 		max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
@@ -1459,7 +1542,8 @@
 					m_selector_parent_cluster_indices[0].push_back(i);
 			}
 
-			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE <= UINT8_MAX);
+			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 <= UINT8_MAX);
+			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT <= UINT8_MAX);
 
 			m_block_parent_selector_cluster.resize(0);
 			m_block_parent_selector_cluster.resize(m_total_blocks);
@@ -1526,7 +1610,7 @@
 					
 					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 					{
-						const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
+						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
 
 						if (!cluster_block_indices.size())
 							continue;
@@ -1610,7 +1694,7 @@
 					
 					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 					{
-						const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
+						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
 
 						if (!cluster_block_indices.size())
 							continue;
@@ -1746,7 +1830,7 @@
 
 				for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_indices.size(); selector_cluster_index++)
 				{
-					const std::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[selector_cluster_index];
+					const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[selector_cluster_index];
 
 					for (uint32_t y = 0; y < 4; y++)
 						for (uint32_t x = 0; x < 4; x++)
@@ -1788,10 +1872,22 @@
 		}
 		else
 		{
-			std::vector< std::vector<uint32_t> > new_cluster_indices;
+			basisu::vector< basisu::vector<uint32_t> > new_cluster_indices;
 
 			// For each block: Determine which quantized selectors best encode that block, given its quantized endpoints.
 
+			basisu::vector<uint8_t> unpacked_optimized_cluster_selectors(16 * m_optimized_cluster_selectors.size());
+			for (uint32_t cluster_index = 0; cluster_index < m_optimized_cluster_selectors.size(); cluster_index++)
+			{
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						unpacked_optimized_cluster_selectors[cluster_index * 16 + y * 4 + x] = (uint8_t)m_optimized_cluster_selectors[cluster_index].get_selector(x, y);
+					}
+				}
+			}
+
 			const uint32_t N = 1024;
 			for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 			{
@@ -1799,7 +1895,7 @@
 				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
 
 #ifndef __EMSCRIPTEN__
-				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &new_cluster_indices] {
+				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &new_cluster_indices, &unpacked_optimized_cluster_selectors] {
 #endif
 
 				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
@@ -1811,7 +1907,7 @@
 					color_rgba trial_block_colors[4];
 					blk.get_block_colors(trial_block_colors, 0);
 
-					uint64_t best_cluster_err = UINT64_MAX;
+					uint64_t best_cluster_err = INT64_MAX;
 					uint32_t best_cluster_index = 0;
 
 					const uint32_t parent_selector_cluster = m_block_parent_selector_cluster.size() ? m_block_parent_selector_cluster[block_index] : 0;
@@ -1819,12 +1915,13 @@
 
 					const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_indices.size();
 
+#if 0
 					for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
 					{
 						const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
 
 						const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
-								
+
 						uint64_t trial_err = 0;
 						for (int y = 0; y < 4; y++)
 						{
@@ -1837,18 +1934,82 @@
 									goto early_out;
 							}
 						}
-								
+
 						if (trial_err < best_cluster_err)
 						{
 							best_cluster_err = trial_err;
 							best_cluster_index = cluster_index;
-							if (!best_cluster_err) 
+							if (!best_cluster_err)
 								break;
 						}
 
 					early_out:
 						;
 					}
+#else
+					if (m_params.m_perceptual)
+					{
+						for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
+						{
+							const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
+							//const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
+
+							uint64_t trial_err = 0;
+																
+							for (int i = 0; i < 16; i++)
+							{
+								const uint32_t sel = unpacked_optimized_cluster_selectors[cluster_index * 16 + i];
+										
+								trial_err += color_distance(true, trial_block_colors[sel], pBlock_pixels[i], false);
+								if (trial_err > best_cluster_err)
+									goto early_out;
+							}
+
+							if (trial_err < best_cluster_err)
+							{
+								best_cluster_err = trial_err;
+								best_cluster_index = cluster_index;
+								if (!best_cluster_err)
+									break;
+							}
+
+						early_out:
+							;
+
+						} // cluster_iter
+					}
+					else
+					{
+						for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
+						{
+							const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
+							//const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
+
+							uint64_t trial_err = 0;
+
+							for (int i = 0; i < 16; i++)
+							{
+								const uint32_t sel = unpacked_optimized_cluster_selectors[cluster_index * 16 + i];
+
+								trial_err += color_distance(false, trial_block_colors[sel], pBlock_pixels[i], false);
+								if (trial_err > best_cluster_err)
+									goto early_out2;
+							}
+
+							if (trial_err < best_cluster_err)
+							{
+								best_cluster_err = trial_err;
+								best_cluster_index = cluster_index;
+								if (!best_cluster_err)
+									break;
+							}
+
+						early_out2:
+							;
+
+						} // cluster_iter
+					}
+#endif
 
 					blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());
 
@@ -1905,7 +2066,7 @@
 			const uint_vec &subblocks = subblock_params.m_subblocks;
 			//uint32_t total_pixels = subblock.m_subblocks.size() * 8;
 
-			std::vector<color_rgba> subblock_colors[2]; // [use_individual_mode]
+			basisu::vector<color_rgba> subblock_colors[2]; // [use_individual_mode]
 			uint8_vec subblock_selectors[2];
 
 			uint64_t cur_subblock_err[2] = { 0, 0 };
@@ -1945,7 +2106,7 @@
 
 			clear_obj(cluster_optimizer_results);
 
-			std::vector<uint8_t> cluster_selectors[2];
+			basisu::vector<uint8_t> cluster_selectors[2];
 
 			for (uint32_t use_individual_mode = 0; use_individual_mode < 2; use_individual_mode++)
 			{
@@ -2075,8 +2236,8 @@
 
 		uint32_t max_endpoint_cluster_size = 0;
 
-		std::vector<uint32_t> cluster_sizes(m_endpoint_clusters.size());
-		std::vector<uint32_t> sorted_cluster_indices(m_endpoint_clusters.size());
+		basisu::vector<uint32_t> cluster_sizes(m_endpoint_clusters.size());
+		basisu::vector<uint32_t> sorted_cluster_indices(m_endpoint_clusters.size());
 		for (uint32_t i = 0; i < m_endpoint_clusters.size(); i++)
 		{
 			max_endpoint_cluster_size = maximum<uint32_t>(max_endpoint_cluster_size, (uint32_t)m_endpoint_clusters[i].size());
@@ -2163,12 +2324,12 @@
 	{
 		debug_printf("reoptimize_remapped_endpoints\n");
 
-		std::vector<uint_vec> new_endpoint_cluster_block_indices(m_endpoint_clusters.size());
+		basisu::vector<uint_vec> new_endpoint_cluster_block_indices(m_endpoint_clusters.size());
 		for (uint32_t i = 0; i < new_block_endpoints.size(); i++)
 			new_endpoint_cluster_block_indices[new_block_endpoints[i]].push_back(i);
 
-		std::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size());
-		std::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size());
+		basisu::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size());
+		basisu::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size());
 		
 		const uint32_t N = 256;
 		for (uint32_t cluster_index_iter = 0; cluster_index_iter < new_endpoint_cluster_block_indices.size(); cluster_index_iter += N)
@@ -2182,14 +2343,14 @@
 
 				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 				{
-					const std::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index];
+					const basisu::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index];
 
 					if (!cluster_block_indices.size())
 						continue;
 
 					const uint32_t total_pixels = (uint32_t)cluster_block_indices.size() * 16;
 
-					std::vector<color_rgba> cluster_pixels(total_pixels);
+					basisu::vector<color_rgba> cluster_pixels(total_pixels);
 					uint8_vec force_selectors(total_pixels);
 
 					etc_block blk;
@@ -2236,16 +2397,19 @@
 
 						if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
 							cluster_optimizer_params.m_quality = cETCQualityUber;
+						else
+							cluster_optimizer_params.m_quality = cETCQualitySlow;
 
 						etc1_optimizer::results cluster_optimizer_results;
 
-						std::vector<uint8_t> cluster_selectors(total_pixels);
+						basisu::vector<uint8_t> cluster_selectors(total_pixels);
 						cluster_optimizer_results.m_n = total_pixels;
 						cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
 
 						optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
 
-						optimizer.compute();
+						if (!optimizer.compute())
+							BASISU_FRONTEND_VERIFY(false);
 
 						new_endpoint_cluster_etc_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
 						new_endpoint_cluster_etc_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
@@ -2310,7 +2474,7 @@
 
 			debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 1\n");
 
-			std::vector<uint_vec> new_endpoint_clusters(total_new_endpoint_clusters);
+			basisu::vector<uint_vec> new_endpoint_clusters(total_new_endpoint_clusters);
 
 			for (uint32_t block_index = 0; block_index < new_block_endpoints.size(); block_index++)
 			{
diff --git a/encoder/basisu_frontend.h b/encoder/basisu_frontend.h
index e5f7954..d5a4088 100644
--- a/encoder/basisu_frontend.h
+++ b/encoder/basisu_frontend.h
@@ -1,5 +1,5 @@
 // basisu_frontend.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -34,8 +34,8 @@
 		uint32_t &operator[] (uint32_t i) { assert(i < 2); return m_comps[i]; }
 	};
 
-	const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 1;
-	const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 5;
+	const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 2;
+	const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 6;
 
 	class basisu_frontend
 	{
@@ -190,16 +190,16 @@
 
 		// For each endpoint cluster: An array of which subblock indices (block_index*2+subblock) are located in that cluster.
 		// Array of block indices for each endpoint cluster
-		std::vector<uint_vec> m_endpoint_clusters; 
+		basisu::vector<uint_vec> m_endpoint_clusters;
 
 		// Array of block indices for each parent endpoint cluster
-		std::vector<uint_vec> m_endpoint_parent_clusters;  
+		basisu::vector<uint_vec> m_endpoint_parent_clusters;
 		
 		// Each block's parent cluster index
 		uint8_vec m_block_parent_endpoint_cluster; 
 
 		// Array of endpoint cluster indices for each parent endpoint cluster
-		std::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster; 
+		basisu::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster;
 				
 		struct endpoint_cluster_etc_params
 		{
@@ -269,35 +269,35 @@
 			}
 		};
 
-		typedef std::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec;
+		typedef basisu::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec;
 		
 		// Each endpoint cluster's ETC1S parameters 
 		cluster_subblock_etc_params_vec m_endpoint_cluster_etc_params;
 
 		// The endpoint cluster index used by each ETC1 subblock.
-		std::vector<vec2U> m_block_endpoint_clusters_indices;
+		basisu::vector<vec2U> m_block_endpoint_clusters_indices;
 				
 		// The block(s) within each selector cluster
 		// Note: If you add anything here that uses selector cluster indicies, be sure to update optimize_selector_codebook()!
-		std::vector<uint_vec> m_selector_cluster_indices;
+		basisu::vector<uint_vec> m_selector_cluster_indices;
 
 		// The selector bits for each selector cluster.
-		std::vector<etc_block> m_optimized_cluster_selectors;
+		basisu::vector<etc_block> m_optimized_cluster_selectors;
 
 		// The block(s) within each parent selector cluster.
-		std::vector<uint_vec> m_selector_parent_cluster_indices;
+		basisu::vector<uint_vec> m_selector_parent_cluster_indices;
 		
 		// Each block's parent selector cluster
 		uint8_vec m_block_parent_selector_cluster;
 
 		// Array of selector cluster indices for each parent selector cluster
-		std::vector<uint_vec> m_selector_clusters_within_each_parent_cluster; 
+		basisu::vector<uint_vec> m_selector_clusters_within_each_parent_cluster;
 
 		basist::etc1_global_selector_codebook_entry_id_vec m_optimized_cluster_selector_global_cb_ids;
 		bool_vec m_selector_cluster_uses_global_cb;
 
 		// Each block's selector cluster index
-		std::vector<uint32_t> m_block_selector_cluster_index;
+		basisu::vector<uint32_t> m_block_selector_cluster_index;
 
 		struct subblock_endpoint_quant_err
 		{
@@ -323,7 +323,7 @@
 		};
 
 		// The sorted subblock endpoint quant error for each endpoint cluster
-		std::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;
+		basisu::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;
 
 		std::mutex m_lock;
 
diff --git a/encoder/basisu_global_selector_palette_helpers.h b/encoder/basisu_global_selector_palette_helpers.h
index 9e9cdbd..7c35439 100644
--- a/encoder/basisu_global_selector_palette_helpers.h
+++ b/encoder/basisu_global_selector_palette_helpers.h
@@ -36,7 +36,7 @@
 
 		void clear() { clear_obj(*this); }
 	};
-	typedef std::vector<pixel_block> pixel_block_vec;
+	typedef basisu::vector<pixel_block> pixel_block_vec;
 
 	uint64_t etc1_global_selector_codebook_find_best_entry(const basist::etc1_global_selector_codebook &codebook,
 		uint32_t num_src_pixel_blocks, const pixel_block *pSrc_pixel_blocks, const etc_block *pBlock_endpoints,
diff --git a/encoder/basisu_gpu_texture.cpp b/encoder/basisu_gpu_texture.cpp
index a9e3d92..5ed2c0e 100644
--- a/encoder/basisu_gpu_texture.cpp
+++ b/encoder/basisu_gpu_texture.cpp
@@ -1,5 +1,5 @@
 // basisu_gpu_texture.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -1302,7 +1302,7 @@
 	};
 
 	// Input is a texture array of mipmapped gpu_image's: gpu_images[array_index][level_index]
-	bool create_ktx_texture_file(uint8_vec &ktx_data, const std::vector<gpu_image_vec>& gpu_images, bool cubemap_flag)
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag)
 	{
 		if (!gpu_images.size())
 		{
@@ -1545,7 +1545,7 @@
 		return true;
 	}
 
-	bool write_compressed_texture_file(const char* pFilename, const std::vector<gpu_image_vec>& g, bool cubemap_flag)
+	bool write_compressed_texture_file(const char* pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag)
 	{
 		std::string extension(string_tolower(string_get_extension(pFilename)));
 
@@ -1577,7 +1577,7 @@
 
 	bool write_compressed_texture_file(const char* pFilename, const gpu_image& g)
 	{
-		std::vector<gpu_image_vec> v;
+		basisu::vector<gpu_image_vec> v;
 		enlarge_vector(v, 1)->push_back(g);
 		return write_compressed_texture_file(pFilename, v, false);
 	}
diff --git a/encoder/basisu_gpu_texture.h b/encoder/basisu_gpu_texture.h
index 7ca9272..619926f 100644
--- a/encoder/basisu_gpu_texture.h
+++ b/encoder/basisu_gpu_texture.h
@@ -1,5 +1,5 @@
 // basisu_gpu_texture.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -114,17 +114,17 @@
 		uint64_vec m_blocks;
 	};
 
-	typedef std::vector<gpu_image> gpu_image_vec;
+	typedef basisu::vector<gpu_image> gpu_image_vec;
 
 	// KTX file writing
 
-	bool create_ktx_texture_file(uint8_vec &ktx_data, const std::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
 		
-	bool write_compressed_texture_file(const char *pFilename, const std::vector<gpu_image_vec>& g, bool cubemap_flag);
+	bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag);
 	
 	inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g)
 	{
-		std::vector<gpu_image_vec> a;
+		basisu::vector<gpu_image_vec> a;
 		a.push_back(g);
 		return write_compressed_texture_file(pFilename, a, false);
 	}
diff --git a/encoder/basisu_miniz.h b/encoder/basisu_miniz.h
index f0da549..679af14 100644
--- a/encoder/basisu_miniz.h
+++ b/encoder/basisu_miniz.h
@@ -3,7 +3,7 @@
   
    Forked from the public domain/unlicense version at: https://code.google.com/archive/p/miniz/ 
    
-   Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+   Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_pvrtc1_4.cpp b/encoder/basisu_pvrtc1_4.cpp
index 330cea8..83c2052 100644
--- a/encoder/basisu_pvrtc1_4.cpp
+++ b/encoder/basisu_pvrtc1_4.cpp
@@ -1,5 +1,5 @@
 // basisu_pvrtc1_4.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_pvrtc1_4.h b/encoder/basisu_pvrtc1_4.h
index 1f17661..db6985a 100644
--- a/encoder/basisu_pvrtc1_4.h
+++ b/encoder/basisu_pvrtc1_4.h
@@ -1,5 +1,5 @@
 // basisu_pvrtc1_4.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_resample_filters.cpp b/encoder/basisu_resample_filters.cpp
index c0adb13..597cb3f 100644
--- a/encoder/basisu_resample_filters.cpp
+++ b/encoder/basisu_resample_filters.cpp
@@ -1,5 +1,5 @@
 // basisu_resampler_filters.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/encoder/basisu_uastc_enc.cpp b/encoder/basisu_uastc_enc.cpp
index 9b75b84..5dc7ea8 100644
--- a/encoder/basisu_uastc_enc.cpp
+++ b/encoder/basisu_uastc_enc.cpp
@@ -1,5 +1,5 @@
 // basisu_uastc_enc.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -3096,6 +3096,23 @@
 	const int32_t DEFAULT_BC7_ERROR_WEIGHT = 50;
 	const float UASTC_ERROR_THRESH = 1.3f;
 
+	// TODO: Quick hack: returns a per-mode error multiplier; modes 0 and 10 get .8 (favored) for when an RDO postprocess will follow, all others 1.0.
+	static inline float get_uastc_mode_weight(uint32_t mode)
+	{
+		const float FAVORED_MODE_WEIGHT = .8f;
+
+		switch (mode)
+		{
+		case 0:
+		case 10:
+			return FAVORED_MODE_WEIGHT;
+		default:
+			break;
+		}
+
+		return 1.0f; // every other mode is left unweighted
+	}
+
 	void encode_uastc(const uint8_t* pRGBAPixels, uastc_block& output_block, uint32_t flags)
 	{
 //		printf("encode_uastc: \n");
@@ -3481,9 +3498,13 @@
 				{
 					for (uint32_t i = 0; i < total_results; i++)
 					{
-						if (total_overall_err[i] < best_err)
+						// TODO: This is a quick hack to favor modes 0 or 10 for better RDO compression.
+						const float err_weight = (flags & cPackUASTCFavorSimplerModes) ? get_uastc_mode_weight(results[i].m_uastc_mode) : 1.0f;
+
+						const uint64_t w = (uint64_t)(total_overall_err[i] * err_weight);
+						if (w  < best_err)
 						{
-							best_err = total_overall_err[i];
+							best_err = w;
 							best_index = i;
 							if (!best_err)
 								break;
@@ -3499,9 +3520,13 @@
 
 						if (err_delta <= UASTC_ERROR_THRESH)
 						{
-							if (total_overall_err[i] < best_err)
+							// TODO: This is a quick hack to favor modes 0 or 10 for better RDO compression.
+							const float err_weight = (flags & cPackUASTCFavorSimplerModes) ? get_uastc_mode_weight(results[i].m_uastc_mode) : 1.0f;
+
+							const uint64_t w = (uint64_t)(total_overall_err[i] * err_weight);
+							if (w < best_err)
 							{
-								best_err = total_overall_err[i];
+								best_err = w;
 								best_index = i;
 								if (!best_err)
 									break;
@@ -3689,6 +3714,7 @@
 		pack_uastc(*pBlock, results, etc1_blk, etc1_bias, eac_a8_blk, bc1_hint0, bc1_hint1);
 		return true;
 	}
+
 	static const uint8_t g_uastc_mode_selector_bits[TOTAL_UASTC_MODES][2] =
 	{
 		{ 65, 63 }, { 69, 31 }, { 73, 46 }, { 89, 29 },
@@ -3697,6 +3723,7 @@
 		{ 81, 47 }, { 94, 30 }, { 92, 31 }, { 62, 63 },
 		{ 98, 30 }, { 61, 62 }, { 49, 79 }
 	};
+
 	static inline uint32_t set_block_bits(uint8_t* pBytes, uint64_t val, uint32_t num_bits, uint32_t cur_ofs)
 	{
 		assert(num_bits <= 64);
@@ -3714,6 +3741,7 @@
 		}
 		return cur_ofs;
 	}
+
 	static const uint8_t g_tdefl_small_dist_extra[512] =
 	{
 		0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -3725,12 +3753,14 @@
 		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
 		7, 7, 7, 7, 7, 7, 7, 7
 	};
+
 	static const uint8_t g_tdefl_large_dist_extra[128] =
 	{
 		0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 		12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
 		13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
 	};
+
 	static inline uint32_t compute_match_cost_estimate(uint32_t dist)
 	{
 		uint32_t len_cost = 7;
@@ -3748,6 +3778,7 @@
 		}
 		return len_cost + dist_cost;
 	}
+
 	struct selector_bitsequence
 	{
 		uint64_t m_sel;
@@ -3777,9 +3808,34 @@
 			return static_cast<std::size_t>(hash_hsieh((uint8_t *)&s, sizeof(s)) ^ s.m_sel);
 		}
 	};
+
+	// Running count, sum, and sum of squares of uint32_t samples; reports their mean, std dev and variance.
+	class tracked_stat
+	{
+	public:
+		tracked_stat() { clear(); }
+		void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+		// Widen before squaring: a 32-bit val * val wraps once val exceeds 65535.
+		void update(uint32_t val) { m_num++; m_total += val; m_total2 += (uint64_t)val * val; }
+		tracked_stat& operator += (uint32_t val) { update(val); return *this; }
+
+		uint32_t get_number_of_values() const { return m_num; }
+		uint64_t get_total() const { return m_total; }
+		uint64_t get_total2() const { return m_total2; }
+
+		// All three return 0 when no values have been added. Std dev is sqrt(n*sum(x^2) - sum(x)^2) / n.
+		float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }
+		float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
+		float get_variance() const { float s = get_std_dev(); return s * s; }
+
+	private:
+		uint32_t m_num;		// number of values added since the last clear()
+		uint64_t m_total;	// sum of the values
+		uint64_t m_total2;	// sum of the squared values
+	};
 		
 	static bool uastc_rdo_blocks(uint32_t first_index, uint32_t last_index, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, 
-		uint32_t &total_skipped, uint32_t &total_refined, uint32_t &total_modified)
+		uint32_t &total_skipped, uint32_t &total_refined, uint32_t &total_modified, uint32_t &total_smooth)
 	{
 		debug_printf("uastc_rdo_blocks: Processing blocks %u to %u\n", first_index, last_index);
 
@@ -3801,6 +3857,24 @@
 			if (block_mode == UASTC_MODE_INDEX_SOLID_COLOR)
 				continue;
 
+			tracked_stat r_stats, g_stats, b_stats, a_stats;
+
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				r_stats.update(pPixels[i].r);
+				g_stats.update(pPixels[i].g);
+				b_stats.update(pPixels[i].b);
+				a_stats.update(pPixels[i].a);
+			}
+
+			const float max_std_dev = std::max<float>(std::max<float>(std::max(r_stats.get_std_dev(), g_stats.get_std_dev()), b_stats.get_std_dev()), a_stats.get_std_dev());
+									
+			float yl = clamp<float>(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f);
+			yl = yl * yl;
+			const float smooth_block_error_scale = lerp<float>(params.m_smooth_block_max_error_scale, 1.0f, yl);
+			if (smooth_block_error_scale > 1.0f)
+				total_smooth++;
+
 			color_rgba decoded_uastc_block[4][4];
 			if (!unpack_uastc(unpacked_blk, (basist::color32*)decoded_uastc_block, false))
 				return false;
@@ -3819,7 +3893,7 @@
 
 			color_rgba decoded_b7_blk[4][4];
 			unpack_block(texture_format::cBC7, &b7_block, &decoded_b7_blk[0][0]);
-
+						
 			uint64_t bc7_err = 0;
 			for (uint32_t i = 0; i < 16; i++)
 				bc7_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_b7_blk)[i], true);
@@ -3827,7 +3901,8 @@
 			uint64_t cur_err = (uastc_err + bc7_err) / 2;
 
 			// Divide by 16*4 to compute RMS error
-			float cur_rms_err = sqrt((float)cur_err * (1.0f / 64.0f));
+			const float cur_ms_err = (float)cur_err * (1.0f / 64.0f);
+			const float cur_rms_err = sqrt(cur_ms_err);
 
 			const uint32_t first_sel_bit = g_uastc_mode_selector_bits[block_mode][0];
 			const uint32_t total_sel_bits = g_uastc_mode_selector_bits[block_mode][1];
@@ -3871,7 +3946,7 @@
 			basist::uastc_block best_block(blk);
 			uint32_t best_block_index = block_index;
 
-			float best_t = (cur_bits + cur_err * params.m_langrangian_multiplier) * params.m_quality_scaler;
+			float best_t = cur_ms_err * smooth_block_error_scale + cur_bits * params.m_lambda;
 
 			// Now scan through previous blocks, insert their selector bit patterns into the current block, and find 
 			// selector bit patterns which don't increase the overall block error too much.
@@ -3934,15 +4009,16 @@
 
 				uint64_t trial_err = (trial_uastc_err + trial_bc7_err) / 2;
 
-				float trial_rms_err = sqrtf((float)trial_err * (1.0f / 64.0f));
+				const float trial_ms_err = (float)trial_err * (1.0f / 64.0f);
+				const float trial_rms_err = sqrtf(trial_ms_err);
 
-				if (trial_rms_err > cur_rms_err* params.m_max_allowed_rms_increase_ratio)
+				if (trial_rms_err > cur_rms_err * params.m_max_allowed_rms_increase_ratio)
 					continue;
 
 				const int block_dist_in_bytes = (block_index - match_block_index) * 16;
 				const int match_bits = compute_match_cost_estimate(block_dist_in_bytes);
 
-				float t = match_bits + trial_err * params.m_langrangian_multiplier;
+				float t = trial_ms_err * smooth_block_error_scale + match_bits * params.m_lambda;
 				if (t < best_t)
 				{
 					best_t = t;
@@ -3953,7 +4029,7 @@
 
 			} // prev_block_index
 
-			if ((params.m_endpoint_refinement) && (best_block_index != block_index))
+			if (best_block_index != block_index)
 			{
 				total_modified++;
 
@@ -3961,7 +4037,7 @@
 				if (!unpack_uastc(best_block, unpacked_best_blk, false, false))
 					return false;
 
-				if (block_mode == 0)
+				if ((params.m_endpoint_refinement) && (block_mode == 0))
 				{
 					// Attempt to refine mode 0 block's endpoints, using the new selectors. This doesn't help much, but it does help.
 					// TODO: We could do this with the other modes too.
@@ -4013,13 +4089,16 @@
 
 						total_refined++;
 					}
-				}
+				} // if ((params.m_endpoint_refinement) && (block_mode == 0))
 
+				// The selectors have changed, so go recompute the block hints.
 				if (!uastc_recompute_hints(&best_block, pPixels, flags, &unpacked_best_blk))
 					return false;
 
+				// Write the modified block
 				pBlocks[block_index] = best_block;
-			}
+			
+			} // if (best_block_index != block_index)
 
 			{
 				uint32_t bit_offset = first_sel_bit;
@@ -4043,9 +4122,9 @@
 	{
 		assert(params.m_max_allowed_rms_increase_ratio > 1.0f);
 		assert(params.m_lz_dict_size > 0);
-		assert(params.m_quality_scaler > 0.0f);
+		assert(params.m_lambda > 0.0f);
 
-		uint32_t total_skipped = 0, total_modified = 0, total_refined = 0;
+		uint32_t total_skipped = 0, total_modified = 0, total_refined = 0, total_smooth = 0;
 
 		uint32_t blocks_per_job = total_jobs ? (num_blocks / total_jobs) : 0;
 
@@ -4055,7 +4134,7 @@
 
 		if ((!pJob_pool) || (total_jobs <= 1) || (blocks_per_job <= 8))
 		{
-			status = uastc_rdo_blocks(0, num_blocks, pBlocks, pBlock_pixels, params, flags, total_skipped, total_refined, total_modified);
+			status = uastc_rdo_blocks(0, num_blocks, pBlocks, pBlock_pixels, params, flags, total_skipped, total_refined, total_modified, total_smooth);
 		}
 		else
 		{
@@ -4067,12 +4146,12 @@
 				const uint32_t last_index = minimum<uint32_t>(num_blocks, block_index_iter + blocks_per_job);
 
 #ifndef __EMSCRIPTEN__
-				pJob_pool->add_job([first_index, last_index, pBlocks, pBlock_pixels, &params, flags, &total_skipped, &total_modified, &total_refined, &all_succeeded, &stat_mutex] {
+				pJob_pool->add_job([first_index, last_index, pBlocks, pBlock_pixels, &params, flags, &total_skipped, &total_modified, &total_refined, &total_smooth, &all_succeeded, &stat_mutex] {
 #endif
 
-					uint32_t job_skipped = 0, job_modified = 0, job_refined = 0;
+					uint32_t job_skipped = 0, job_modified = 0, job_refined = 0, job_smooth = 0;
 
-					bool status = uastc_rdo_blocks(first_index, last_index, pBlocks, pBlock_pixels, params, flags, job_skipped, job_refined, job_modified);
+					bool status = uastc_rdo_blocks(first_index, last_index, pBlocks, pBlock_pixels, params, flags, job_skipped, job_refined, job_modified, job_smooth);
 
 					{
 						std::lock_guard<std::mutex> lck(stat_mutex);
@@ -4081,6 +4160,7 @@
 						total_skipped += job_skipped;
 						total_modified += job_modified;
 						total_refined += job_refined;
+						total_smooth += job_smooth;
 					}
 
 #ifndef __EMSCRIPTEN__
@@ -4097,7 +4177,7 @@
 			status = all_succeeded;
 		}
 
-		debug_printf("uastc_rdo: Total modified: %3.2f%%, total skipped: %3.2f%%, total refined: %3.2f%%\n", total_modified * 100.0f / num_blocks, total_skipped * 100.0f / num_blocks, total_refined * 100.0f / num_blocks);
+		debug_printf("uastc_rdo: Total modified: %3.2f%%, total skipped: %3.2f%%, total refined: %3.2f%%, total smooth: %3.2f%%\n", total_modified * 100.0f / num_blocks, total_skipped * 100.0f / num_blocks, total_refined * 100.0f / num_blocks, total_smooth * 100.0f / num_blocks);
 				
 		return status;
 	}
diff --git a/encoder/basisu_uastc_enc.h b/encoder/basisu_uastc_enc.h
index eeb2525..ba39a55 100644
--- a/encoder/basisu_uastc_enc.h
+++ b/encoder/basisu_uastc_enc.h
@@ -1,5 +1,5 @@
 // basisu_uastc_enc.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -57,6 +57,9 @@
 		cPackUASTCETC1FasterHints = 64,
 		cPackUASTCETC1FastestHints = 128,
 		cPackUASTCETC1DisableFlipAndIndividual = 256,
+		
+		// Favor UASTC modes 0 and 10 more than the others (this is experimental, it's useful for RDO compression)
+		cPackUASTCFavorSimplerModes = 512, 
 	};
 
 	// pRGBAPixels: Pointer to source 4x4 block of RGBA pixels (R first in memory).
@@ -74,8 +77,19 @@
 	};
 			  
 	void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1);
+
+	const uint32_t UASCT_RDO_DEFAULT_LZ_DICT_SIZE = 4096;
+
 	const float UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO = 10.0f;
 	const float UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH = 8.0f;
+	
+	// The RDO encoder computes a smoothness factor, from [0,1], for each block. To do this it computes each block's maximum component standard deviation, then it divides this by this value and clamps the result.
+	// Larger values will result in more blocks being protected from too much distortion.
+	const float UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV = 18.0f;
+	
+	// The RDO encoder can artificially boost the error of smooth blocks, in order to suppress distortions on smooth areas of the texture.
+	// The encoder will use this value as the maximum error scale to use on smooth blocks. The larger this value, the better smooth blocks will look. Set to 1.0 to disable this completely.
+	const float UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE = 10.0f;
 
 	struct uastc_rdo_params
 	{
@@ -86,25 +100,23 @@
 
 		void clear()
 		{
-			m_quality_scaler = 1.0f;
-			m_lz_dict_size = 32768;
-			m_langrangian_multiplier = 0.025f;
+			m_lz_dict_size = UASCT_RDO_DEFAULT_LZ_DICT_SIZE;
+			m_lambda = 0.5f;
 			m_max_allowed_rms_increase_ratio = UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO;
 			m_skip_block_rms_thresh = UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH;
 			m_endpoint_refinement = true;
 			m_lz_literal_cost = 100;
+						
+			m_max_smooth_block_std_dev = UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV;
+			m_smooth_block_max_error_scale = UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE;
 		}
-
-		// m_quality_scaler: This value controls the overall quality vs. size tradeoff. Smaller values=larger/higher quality, 0=no change, larger values=smaller/lower quality.
-		// Good range to try is .2-2.5.
-		float m_quality_scaler;
-
+				
 		// m_lz_dict_size: Size of LZ dictionary to simulate in bytes. The larger this value, the slower the encoder but the higher the quality per LZ compressed bit.
 		uint32_t m_lz_dict_size;
 
-		// m_langrangian_multiplier: The post-processor tries to reduce rate+distortion*langrangian_mul (rate is approximate LZ bits and distortion is squared error).
-		// Larger values push the postprocessor towards optimizing more for lower distortion, and smaller values more for rate.
-		float m_langrangian_multiplier;
+		// m_lambda: The post-processor tries to reduce distortion+rate*lambda (rate is approximate LZ bits and distortion is scaled MS error).
+		// Larger values push the postprocessor towards optimizing more for lower rate, and smaller values more for distortion. 0=minimal distortion.
+		float m_lambda;
 		
 		// m_max_allowed_rms_increase_ratio: How much the RMS error of a block is allowed to increase before a trial is rejected. 1.0=no increase allowed, 1.05=5% increase allowed, etc.
 		float m_max_allowed_rms_increase_ratio;
@@ -115,6 +127,9 @@
 		// m_endpoint_refinement: If true, the post-process will attempt to refine the endpoints of blocks with modified selectors. 
 		bool m_endpoint_refinement;
 
+		float m_max_smooth_block_std_dev;
+		float m_smooth_block_max_error_scale;
+		
 		uint32_t m_lz_literal_cost;
 	};
 
diff --git a/transcoder/basisu.h b/transcoder/basisu.h
index 6e6f46d..9d8dfa6 100644
--- a/transcoder/basisu.h
+++ b/transcoder/basisu.h
@@ -1,5 +1,5 @@
 // basisu.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -63,10 +63,11 @@
 #include <functional>
 #include <iterator>
 #include <type_traits>
-#include <vector>
 #include <assert.h>
 #include <random>
 
+#include "basisu_containers.h"
+
 #ifdef max
 #undef max
 #endif
@@ -108,13 +109,13 @@
 	const char BASISU_PATH_SEPERATOR_CHAR = '/';
 #endif
 
-	typedef std::vector<uint8_t> uint8_vec;
-	typedef std::vector<int16_t> int16_vec;
-	typedef std::vector<uint16_t> uint16_vec;
-	typedef std::vector<uint32_t> uint_vec;
-	typedef std::vector<uint64_t> uint64_vec;
-	typedef std::vector<int> int_vec;
-	typedef std::vector<bool> bool_vec;
+	typedef basisu::vector<uint8_t> uint8_vec;
+	typedef basisu::vector<int16_t> int16_vec;
+	typedef basisu::vector<uint16_t> uint16_vec;
+	typedef basisu::vector<uint32_t> uint_vec;
+	typedef basisu::vector<uint64_t> uint64_vec;
+	typedef basisu::vector<int> int_vec;
+	typedef basisu::vector<bool> bool_vec;
 
 	void enable_debug_printf(bool enabled);
 	void debug_printf(const char *pFmt, ...);
diff --git a/transcoder/basisu_global_selector_palette.h b/transcoder/basisu_global_selector_palette.h
index 4c94b01..686393b 100644
--- a/transcoder/basisu_global_selector_palette.h
+++ b/transcoder/basisu_global_selector_palette.h
@@ -1,5 +1,5 @@
 // basisu_global_selector_palette.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -609,7 +609,7 @@
 		uint8_t m_selectors[16];
 	};
 
-	typedef std::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec;
+	typedef basisu::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec;
 
 	extern const uint32_t g_global_selector_cb[];
 	extern const uint32_t g_global_selector_cb_size;
@@ -628,7 +628,7 @@
 		void set(uint32_t palette_index, const etc1_global_palette_entry_modifier &modifier) { m_palette_index = palette_index; m_modifier = modifier; }
 	};
 
-	typedef std::vector<etc1_global_selector_codebook_entry_id> etc1_global_selector_codebook_entry_id_vec;
+	typedef basisu::vector<etc1_global_selector_codebook_entry_id> etc1_global_selector_codebook_entry_id_vec;
 
 	class etc1_global_selector_codebook
 	{
diff --git a/transcoder/basisu_transcoder.cpp b/transcoder/basisu_transcoder.cpp
index be96ae1..a0e0b74 100644
--- a/transcoder/basisu_transcoder.cpp
+++ b/transcoder/basisu_transcoder.cpp
@@ -1,5 +1,5 @@
 // basisu_transcoder.cpp
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 
 #include "basisu_transcoder.h"
 #include <limits.h>
-#include <vector>
+#include "basisu_containers_impl.h"
 
 #ifndef BASISD_IS_BIG_ENDIAN
 // TODO: This doesn't work on OSX. How can this be so difficult?
@@ -1342,8 +1342,8 @@
 		uint32_t m_base;
 		uint32_t m_table;
 		uint32_t m_multiplier;
-		std::vector<uint8_t> m_selectors;
-		std::vector<uint8_t> m_selectors_temp;
+		basisu::vector<uint8_t> m_selectors;
+		basisu::vector<uint8_t> m_selectors_temp;
 	};
 
 	static uint64_t pack_eac_a8_exhaustive(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels)
@@ -1739,8 +1739,8 @@
 		uint32_t m_base;
 		uint32_t m_table;
 		uint32_t m_multiplier;
-		std::vector<uint8_t> m_selectors;
-		std::vector<uint8_t> m_selectors_temp;
+		basisu::vector<uint8_t> m_selectors;
+		basisu::vector<uint8_t> m_selectors_temp;
 	};
 
 	static uint64_t pack_eac_r11_exhaustive(pack_eac_r11_results& results, const uint8_t* pPixels, uint32_t num_pixels)
@@ -7914,7 +7914,7 @@
 				output_rows_in_pixels = orig_height;
 		}
 		
-		std::vector<uint32_t>* pPrev_frame_indices = nullptr;
+		basisu::vector<uint32_t>* pPrev_frame_indices = nullptr;
 		if (is_video)
 		{
 			// TODO: Add check to make sure the caller hasn't tried skipping past p-frames
@@ -8874,7 +8874,7 @@
 			assert(alpha_length);
 
 			// Temp buffer to hold alpha block endpoint/selector indices
-			std::vector<uint32_t> temp_block_indices(total_slice_blocks);
+			basisu::vector<uint32_t> temp_block_indices(total_slice_blocks);
 
 			// First transcode alpha data to temp buffer
 			//status = transcode_slice(pData, data_size, slice_index + 1, &temp_block_indices[0], total_slice_blocks, block_format::cIndices, sizeof(uint32_t), decode_flags, pSlice_descs[slice_index].m_num_blocks_x, pState);
@@ -16446,7 +16446,7 @@
 		if (!basisu::is_pow2(width) || !basisu::is_pow2(height))
 			return false;
 
-		std::vector<uint32_t> temp_endpoints(num_blocks_x * num_blocks_y);
+		basisu::vector<uint32_t> temp_endpoints(num_blocks_x * num_blocks_y);
 
 		for (uint32_t y = 0; y < num_blocks_y; y++)
 		{
@@ -16505,7 +16505,7 @@
 		if (!basisu::is_pow2(width) || !basisu::is_pow2(height))
 			return false;
 
-		std::vector<uint32_t> temp_endpoints(num_blocks_x * num_blocks_y);
+		basisu::vector<uint32_t> temp_endpoints(num_blocks_x * num_blocks_y);
 
 		for (uint32_t y = 0; y < num_blocks_y; y++)
 		{
diff --git a/transcoder/basisu_transcoder.h b/transcoder/basisu_transcoder.h
index c6051cd..1134a34 100644
--- a/transcoder/basisu_transcoder.h
+++ b/transcoder/basisu_transcoder.h
@@ -1,5 +1,5 @@
 // basisu_transcoder.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -155,10 +155,10 @@
 			uint8_t m_pred_bits;
 		};
 
-		std::vector<block_preds> m_block_endpoint_preds[2];
+		basisu::vector<block_preds> m_block_endpoint_preds[2];
 		
 		enum { cMaxPrevFrameLevels = 16 };
-		std::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] 
+		basisu::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] 
 	};
 	
 	// Low-level helper class that does the actual transcoding.
@@ -217,10 +217,10 @@
 		}
 
 	private:
-		typedef std::vector<endpoint> endpoint_vec;
+		typedef basisu::vector<endpoint> endpoint_vec;
 		endpoint_vec m_endpoints;
 
-		typedef std::vector<selector> selector_vec;
+		typedef basisu::vector<selector> selector_vec;
 		selector_vec m_selectors;
 
 		const etc1_global_selector_codebook *m_pGlobal_sel_codebook;
@@ -312,7 +312,7 @@
 		bool m_iframe_flag;		// true if the slice is an I-Frame
 	};
 
-	typedef std::vector<basisu_slice_info> basisu_slice_info_vec;
+	typedef basisu::vector<basisu_slice_info> basisu_slice_info_vec;
 
 	struct basisu_image_info
 	{
@@ -386,7 +386,7 @@
 		basisu_slice_info_vec m_slice_info;
 
 		uint32_t m_total_images;	 // total # of images
-		std::vector<uint32_t> m_image_mipmap_levels; // the # of mipmap levels for each image
+		basisu::vector<uint32_t> m_image_mipmap_levels; // the # of mipmap levels for each image
 
 		uint32_t m_userdata0;
 		uint32_t m_userdata1;
diff --git a/transcoder/basisu_transcoder_internal.h b/transcoder/basisu_transcoder_internal.h
index 80e43e6..9fdcbc3 100644
--- a/transcoder/basisu_transcoder_internal.h
+++ b/transcoder/basisu_transcoder_internal.h
@@ -1,5 +1,5 @@
 // basisu_transcoder_internal.h - Universal texture format transcoder library.
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
@@ -20,8 +20,8 @@
 #pragma warning (disable: 4127) //  conditional expression is constant
 #endif
 
-#define BASISD_LIB_VERSION 112
-#define BASISD_VERSION_STRING "01.12"
+#define BASISD_LIB_VERSION 113
+#define BASISD_VERSION_STRING "01.13"
 
 #ifdef _DEBUG
 #define BASISD_BUILD_DEBUG
diff --git a/webgl/encode_test/index.html b/webgl/encode_test/index.html
index 8be010c..ad74da1 100644
--- a/webgl/encode_test/index.html
+++ b/webgl/encode_test/index.html
@@ -440,9 +440,15 @@
 	
 	if (!uastcFlag)
 		log('Encoding at ETC1S quality level ' + qualityLevel);
+		
+	const startTime = performance.now();
 	
 	num_output_bytes = basisEncoder.encode(basisFileData);
 	
+	const elapsed = performance.now() - startTime;
+	
+	logTime('encoding time', elapsed.toFixed(2));
+	
 	var actualBasisFileData = new Uint8Array(basisFileData.buffer, 0, num_output_bytes);
 
 	basisEncoder.delete();
diff --git a/webgl/encoder/CMakeLists.txt b/webgl/encoder/CMakeLists.txt
index ed014a0..0e3c0d0 100644
--- a/webgl/encoder/CMakeLists.txt
+++ b/webgl/encoder/CMakeLists.txt
@@ -22,19 +22,20 @@
 	../../encoder/basisu_ssim.cpp                            
 	../../encoder/basisu_astc_decomp.cpp                     
 	../../encoder/basisu_uastc_enc.cpp                       
-	../../encoder/basisu_bc7enc.cpp                          
+	../../encoder/basisu_bc7enc.cpp
+	../../encoder/basisu_kernels_sse.cpp
 	../../encoder/lodepng.cpp                                
 	../../encoder/apg_bmp.c                                  
 	../../encoder/jpgd.cpp                                   
   )
 
-  #target_compile_definitions(basis_encoder.js PRIVATE NDEBUG BASISD_SUPPORT_UASTC=1 BASISD_SUPPORT_BC7=1 BASISD_SUPPORT_ATC=0 BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY=0 BASISD_SUPPORT_PVRTC2=0 BASISD_SUPPORT_FXT1=0 BASISD_SUPPORT_ETC2_EAC_RG11=0 BASISU_SUPPORT_ENCODING=1)
+  #target_compile_definitions(basis_encoder.js PRIVATE NDEBUG BASISD_SUPPORT_UASTC=1 BASISD_SUPPORT_BC7=1 BASISD_SUPPORT_ATC=0 BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY=0 BASISD_SUPPORT_PVRTC2=0 BASISD_SUPPORT_FXT1=0 BASISD_SUPPORT_ETC2_EAC_RG11=0 BASISU_SUPPORT_ENCODING=1 BASISU_SUPPORT_SSE=0)
   #target_compile_options(basis_encoder.js PRIVATE -fno-strict-aliasing -O3)
   
-  #target_compile_definitions(basis_encoder.js PRIVATE DEBUG BASISD_SUPPORT_UASTC=1 BASISD_SUPPORT_BC7=1 BASISD_SUPPORT_ATC=0 BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY=0 BASISD_SUPPORT_PVRTC2=0 BASISD_SUPPORT_FXT1=0 BASISD_SUPPORT_ETC2_EAC_RG11=0 BASISU_SUPPORT_ENCODING=1)
+  #target_compile_definitions(basis_encoder.js PRIVATE DEBUG BASISD_SUPPORT_UASTC=1 BASISD_SUPPORT_BC7=1 BASISD_SUPPORT_ATC=0 BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY=0 BASISD_SUPPORT_PVRTC2=0 BASISD_SUPPORT_FXT1=0 BASISD_SUPPORT_ETC2_EAC_RG11=0 BASISU_SUPPORT_ENCODING=1 BASISU_SUPPORT_SSE=0)
   #target_compile_options(basis_encoder.js PRIVATE -fno-strict-aliasing -g -O1 -fsanitize=undefined -fsanitize=address)
   
-  target_compile_definitions(basis_encoder.js PRIVATE NDEBUG BASISD_SUPPORT_UASTC=1 BASISD_SUPPORT_BC7=1 BASISD_SUPPORT_ATC=0 BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY=0 BASISD_SUPPORT_PVRTC2=0 BASISD_SUPPORT_FXT1=0 BASISD_SUPPORT_ETC2_EAC_RG11=0 BASISU_SUPPORT_ENCODING=1)
+  target_compile_definitions(basis_encoder.js PRIVATE NDEBUG BASISD_SUPPORT_UASTC=1 BASISD_SUPPORT_BC7=1 BASISD_SUPPORT_ATC=0 BASISD_SUPPORT_ASTC_HIGHER_OPAQUE_QUALITY=0 BASISD_SUPPORT_PVRTC2=0 BASISD_SUPPORT_FXT1=0 BASISD_SUPPORT_ETC2_EAC_RG11=0 BASISU_SUPPORT_ENCODING=1 BASISU_SUPPORT_SSE=0)
   target_compile_options(basis_encoder.js PRIVATE -fno-strict-aliasing -O3)
   
   target_include_directories(basis_encoder.js PRIVATE ../../transcoder)
diff --git a/webgl/transcoder/basis_wrappers.cpp b/webgl/transcoder/basis_wrappers.cpp
index 8056b0d..5f77a26 100644
--- a/webgl/transcoder/basis_wrappers.cpp
+++ b/webgl/transcoder/basis_wrappers.cpp
@@ -60,7 +60,7 @@
 	g_pGlobal_codebook = new basist::etc1_global_selector_codebook(g_global_selector_cb_size, g_global_selector_cb);
 }
 
-static void copy_from_jsbuffer(const emscripten::val& srcBuffer, std::vector<uint8_t>& dstVec)
+static void copy_from_jsbuffer(const emscripten::val& srcBuffer, basisu::vector<uint8_t>& dstVec)
 {
 	unsigned int length = srcBuffer["length"].as<unsigned int>();
 
@@ -73,7 +73,7 @@
 	memoryView.call<void>("set", srcBuffer);
 }
 
-static bool copy_to_jsbuffer(const emscripten::val& dstBuffer, const std::vector<uint8_t>& srcVec)
+static bool copy_to_jsbuffer(const emscripten::val& dstBuffer, const basisu::vector<uint8_t>& srcVec)
 {
 	if (srcVec.empty())
 	{
@@ -166,12 +166,12 @@
 {
 	int m_magic = 0;
 	basisu_transcoder m_transcoder;
-	std::vector<uint8_t> m_file;
+	basisu::vector<uint8_t> m_file;
 
 	basis_file(const emscripten::val& jsBuffer)
 		: m_file([&]() {
 		size_t byteLength = jsBuffer["byteLength"].as<size_t>();
-		return std::vector<uint8_t>(byteLength);
+		return basisu::vector<uint8_t>(byteLength);
 			}()),
 		m_transcoder(g_pGlobal_codebook)
 	{
@@ -422,7 +422,7 @@
 				if (!m_transcoder.get_image_level_desc(m_file.data(), m_file.size(), image_index, level_index, orig_width, orig_height, total_blocks))
 					return 0;
 
-				std::vector<uint8_t> dst_data;
+				basisu::vector<uint8_t> dst_data;
 
 				uint32_t flags = get_alpha_for_opaque_formats ? cDecodeFlagsTranscodeAlphaDataToOpaqueFormats : 0;
 
@@ -496,7 +496,7 @@
 			m_params.m_source_images.resize(slice_index + 1);
 
 		// First copy the src image buffer to the heap.
-		std::vector<uint8_t> src_image_buf;
+		basisu::vector<uint8_t> src_image_buf;
 		copy_from_jsbuffer(src_image_js_val, src_image_buf);
 
 		// Now extract the source image.
@@ -616,7 +616,7 @@
 	
 	bool decode_palettes(uint32_t num_endpoints, const emscripten::val& endpoint_data, uint32_t num_selectors, const emscripten::val& selector_data)
 	{
-		std::vector<uint8_t> temp_endpoint_data, temp_selector_data;
+		basisu::vector<uint8_t> temp_endpoint_data, temp_selector_data;
 		copy_from_jsbuffer(endpoint_data, temp_endpoint_data);
 		copy_from_jsbuffer(selector_data, temp_selector_data);
 
@@ -642,7 +642,7 @@
 	
 	bool decode_tables(const emscripten::val& table_data)
 	{
-		std::vector<uint8_t> temp_table_data;
+		basisu::vector<uint8_t> temp_table_data;
 		copy_from_jsbuffer(table_data, temp_table_data);
 		
 		if (!temp_table_data.size())
@@ -679,7 +679,7 @@
 		}
 		
 		// FIXME: Access the JavaScript buffer directly vs. copying it.
-		std::vector<uint8_t> temp_comp_data;
+		basisu::vector<uint8_t> temp_comp_data;
 		copy_from_jsbuffer(compressed_data, temp_comp_data);
 		
 		if (!temp_comp_data.size())
@@ -701,7 +701,7 @@
 			return false;
 		}
 		
-		std::vector<uint8_t> temp_output_blocks(output_blocks_len);
+		basisu::vector<uint8_t> temp_output_blocks(output_blocks_len);
 						
 		bool status = basisu_lowlevel_etc1s_transcoder::transcode_image(
 			(transcoder_texture_format)target_format,
@@ -757,7 +757,7 @@
 	}
 			
 	// FIXME: Access the JavaScript buffer directly vs. copying it.
-	std::vector<uint8_t> temp_comp_data;
+	basisu::vector<uint8_t> temp_comp_data;
 	copy_from_jsbuffer(compressed_data, temp_comp_data);
 	
 	if (!temp_comp_data.size())
@@ -789,7 +789,7 @@
 	printf("has_alpha: %u is_video: %u\n", has_alpha, is_video);
 #endif	
 	
-	std::vector<uint8_t> temp_output_blocks(output_blocks_len);
+	basisu::vector<uint8_t> temp_output_blocks(output_blocks_len);
 	
 	basisu_lowlevel_uastc_transcoder transcoder;
 	
diff --git a/webgl_videotest/basis.js b/webgl_videotest/basis.js
index 2612e08..73aaf58 100644
--- a/webgl_videotest/basis.js
+++ b/webgl_videotest/basis.js
@@ -1 +1 @@
-var Module=typeof Module!=="undefined"?Module:{};var moduleOverrides={};var key;for(key in Module){if(Module.hasOwnProperty(key)){moduleOverrides[key]=Module[key]}}var arguments_=[];var thisProgram="./this.program";var quit_=function(status,toThrow){throw toThrow};var ENVIRONMENT_IS_WEB=false;var ENVIRONMENT_IS_WORKER=false;var ENVIRONMENT_IS_NODE=false;var ENVIRONMENT_IS_SHELL=false;ENVIRONMENT_IS_WEB=typeof window==="object";ENVIRONMENT_IS_WORKER=typeof importScripts==="function";ENVIRONMENT_IS_NODE=typeof process==="object"&&typeof process.versions==="object"&&typeof process.versions.node==="string";ENVIRONMENT_IS_SHELL=!ENVIRONMENT_IS_WEB&&!ENVIRONMENT_IS_NODE&&!ENVIRONMENT_IS_WORKER;var scriptDirectory="";function locateFile(path){if(Module["locateFile"]){return Module["locateFile"](path,scriptDirectory)}return scriptDirectory+path}var read_,readAsync,readBinary,setWindowTitle;var nodeFS;var nodePath;if(ENVIRONMENT_IS_NODE){if(ENVIRONMENT_IS_WORKER){scriptDirectory=require("path").dirname(scriptDirectory)+"/"}else{scriptDirectory=__dirname+"/"}read_=function shell_read(filename,binary){if(!nodeFS)nodeFS=require("fs");if(!nodePath)nodePath=require("path");filename=nodePath["normalize"](filename);return nodeFS["readFileSync"](filename,binary?null:"utf8")};readBinary=function readBinary(filename){var ret=read_(filename,true);if(!ret.buffer){ret=new Uint8Array(ret)}assert(ret.buffer);return ret};if(process["argv"].length>1){thisProgram=process["argv"][1].replace(/\\/g,"/")}arguments_=process["argv"].slice(2);if(typeof module!=="undefined"){module["exports"]=Module}process["on"]("uncaughtException",function(ex){if(!(ex instanceof ExitStatus)){throw ex}});process["on"]("unhandledRejection",abort);quit_=function(status){process["exit"](status)};Module["inspect"]=function(){return"[Emscripten Module object]"}}else if(ENVIRONMENT_IS_SHELL){if(typeof read!="undefined"){read_=function shell_read(f){return read(f)}}readBinary=function readBinary(f){var data;if(typeof 
readbuffer==="function"){return new Uint8Array(readbuffer(f))}data=read(f,"binary");assert(typeof data==="object");return data};if(typeof scriptArgs!="undefined"){arguments_=scriptArgs}else if(typeof arguments!="undefined"){arguments_=arguments}if(typeof quit==="function"){quit_=function(status){quit(status)}}if(typeof print!=="undefined"){if(typeof console==="undefined")console={};console.log=print;console.warn=console.error=typeof printErr!=="undefined"?printErr:print}}else if(ENVIRONMENT_IS_WEB||ENVIRONMENT_IS_WORKER){if(ENVIRONMENT_IS_WORKER){scriptDirectory=self.location.href}else if(typeof document!=="undefined"&&document.currentScript){scriptDirectory=document.currentScript.src}if(scriptDirectory.indexOf("blob:")!==0){scriptDirectory=scriptDirectory.substr(0,scriptDirectory.lastIndexOf("/")+1)}else{scriptDirectory=""}{read_=function shell_read(url){var xhr=new XMLHttpRequest;xhr.open("GET",url,false);xhr.send(null);return xhr.responseText};if(ENVIRONMENT_IS_WORKER){readBinary=function readBinary(url){var xhr=new XMLHttpRequest;xhr.open("GET",url,false);xhr.responseType="arraybuffer";xhr.send(null);return new Uint8Array(xhr.response)}}readAsync=function readAsync(url,onload,onerror){var xhr=new XMLHttpRequest;xhr.open("GET",url,true);xhr.responseType="arraybuffer";xhr.onload=function xhr_onload(){if(xhr.status==200||xhr.status==0&&xhr.response){onload(xhr.response);return}onerror()};xhr.onerror=onerror;xhr.send(null)}}setWindowTitle=function(title){document.title=title}}else{}var out=Module["print"]||console.log.bind(console);var err=Module["printErr"]||console.warn.bind(console);for(key in moduleOverrides){if(moduleOverrides.hasOwnProperty(key)){Module[key]=moduleOverrides[key]}}moduleOverrides=null;if(Module["arguments"])arguments_=Module["arguments"];if(Module["thisProgram"])thisProgram=Module["thisProgram"];if(Module["quit"])quit_=Module["quit"];var STACK_ALIGN=16;function 
warnOnce(text){if(!warnOnce.shown)warnOnce.shown={};if(!warnOnce.shown[text]){warnOnce.shown[text]=1;err(text)}}function convertJsFunctionToWasm(func,sig){if(typeof WebAssembly.Function==="function"){var typeNames={"i":"i32","j":"i64","f":"f32","d":"f64"};var type={parameters:[],results:sig[0]=="v"?[]:[typeNames[sig[0]]]};for(var i=1;i<sig.length;++i){type.parameters.push(typeNames[sig[i]])}return new WebAssembly.Function(type,func)}var typeSection=[1,0,1,96];var sigRet=sig.slice(0,1);var sigParam=sig.slice(1);var typeCodes={"i":127,"j":126,"f":125,"d":124};typeSection.push(sigParam.length);for(var i=0;i<sigParam.length;++i){typeSection.push(typeCodes[sigParam[i]])}if(sigRet=="v"){typeSection.push(0)}else{typeSection=typeSection.concat([1,typeCodes[sigRet]])}typeSection[1]=typeSection.length-2;var bytes=new Uint8Array([0,97,115,109,1,0,0,0].concat(typeSection,[2,7,1,1,101,1,102,0,0,7,5,1,1,102,0,0]));var module=new WebAssembly.Module(bytes);var instance=new WebAssembly.Instance(module,{"e":{"f":func}});var wrappedFunc=instance.exports["f"];return wrappedFunc}var freeTableIndexes=[];var functionsInTableMap;function getEmptyTableSlot(){if(freeTableIndexes.length){return freeTableIndexes.pop()}try{wasmTable.grow(1)}catch(err){if(!(err instanceof RangeError)){throw err}throw"Unable to grow wasm table. 
Set ALLOW_TABLE_GROWTH."}return wasmTable.length-1}function addFunctionWasm(func,sig){if(!functionsInTableMap){functionsInTableMap=new WeakMap;for(var i=0;i<wasmTable.length;i++){var item=wasmTable.get(i);if(item){functionsInTableMap.set(item,i)}}}if(functionsInTableMap.has(func)){return functionsInTableMap.get(func)}var ret=getEmptyTableSlot();try{wasmTable.set(ret,func)}catch(err){if(!(err instanceof TypeError)){throw err}var wrapped=convertJsFunctionToWasm(func,sig);wasmTable.set(ret,wrapped)}functionsInTableMap.set(func,ret);return ret}var tempRet0=0;var wasmBinary;if(Module["wasmBinary"])wasmBinary=Module["wasmBinary"];var noExitRuntime;if(Module["noExitRuntime"])noExitRuntime=Module["noExitRuntime"];if(typeof WebAssembly!=="object"){abort("no native wasm support detected")}var wasmMemory;var ABORT=false;var EXITSTATUS;function assert(condition,text){if(!condition){abort("Assertion failed: "+text)}}function getCFunc(ident){var func=Module["_"+ident];assert(func,"Cannot call unknown function "+ident+", make sure it is exported");return func}function ccall(ident,returnType,argTypes,args,opts){var toC={"string":function(str){var ret=0;if(str!==null&&str!==undefined&&str!==0){var len=(str.length<<2)+1;ret=stackAlloc(len);stringToUTF8(str,ret,len)}return ret},"array":function(arr){var ret=stackAlloc(arr.length);writeArrayToMemory(arr,ret);return ret}};function convertReturnValue(ret){if(returnType==="string")return UTF8ToString(ret);if(returnType==="boolean")return Boolean(ret);return ret}var func=getCFunc(ident);var cArgs=[];var stack=0;if(args){for(var i=0;i<args.length;i++){var converter=toC[argTypes[i]];if(converter){if(stack===0)stack=stackSave();cArgs[i]=converter(args[i])}else{cArgs[i]=args[i]}}}var ret=func.apply(null,cArgs);ret=convertReturnValue(ret);if(stack!==0)stackRestore(stack);return ret}var ALLOC_STACK=1;var UTF8Decoder=typeof TextDecoder!=="undefined"?new TextDecoder("utf8"):undefined;function UTF8ArrayToString(heap,idx,maxBytesToRead){var 
endIdx=idx+maxBytesToRead;var endPtr=idx;while(heap[endPtr]&&!(endPtr>=endIdx))++endPtr;if(endPtr-idx>16&&heap.subarray&&UTF8Decoder){return UTF8Decoder.decode(heap.subarray(idx,endPtr))}else{var str="";while(idx<endPtr){var u0=heap[idx++];if(!(u0&128)){str+=String.fromCharCode(u0);continue}var u1=heap[idx++]&63;if((u0&224)==192){str+=String.fromCharCode((u0&31)<<6|u1);continue}var u2=heap[idx++]&63;if((u0&240)==224){u0=(u0&15)<<12|u1<<6|u2}else{u0=(u0&7)<<18|u1<<12|u2<<6|heap[idx++]&63}if(u0<65536){str+=String.fromCharCode(u0)}else{var ch=u0-65536;str+=String.fromCharCode(55296|ch>>10,56320|ch&1023)}}}return str}function UTF8ToString(ptr,maxBytesToRead){return ptr?UTF8ArrayToString(HEAPU8,ptr,maxBytesToRead):""}function stringToUTF8Array(str,heap,outIdx,maxBytesToWrite){if(!(maxBytesToWrite>0))return 0;var startIdx=outIdx;var endIdx=outIdx+maxBytesToWrite-1;for(var i=0;i<str.length;++i){var u=str.charCodeAt(i);if(u>=55296&&u<=57343){var u1=str.charCodeAt(++i);u=65536+((u&1023)<<10)|u1&1023}if(u<=127){if(outIdx>=endIdx)break;heap[outIdx++]=u}else if(u<=2047){if(outIdx+1>=endIdx)break;heap[outIdx++]=192|u>>6;heap[outIdx++]=128|u&63}else if(u<=65535){if(outIdx+2>=endIdx)break;heap[outIdx++]=224|u>>12;heap[outIdx++]=128|u>>6&63;heap[outIdx++]=128|u&63}else{if(outIdx+3>=endIdx)break;heap[outIdx++]=240|u>>18;heap[outIdx++]=128|u>>12&63;heap[outIdx++]=128|u>>6&63;heap[outIdx++]=128|u&63}}heap[outIdx]=0;return outIdx-startIdx}function stringToUTF8(str,outPtr,maxBytesToWrite){return stringToUTF8Array(str,HEAPU8,outPtr,maxBytesToWrite)}function lengthBytesUTF8(str){var len=0;for(var i=0;i<str.length;++i){var u=str.charCodeAt(i);if(u>=55296&&u<=57343)u=65536+((u&1023)<<10)|str.charCodeAt(++i)&1023;if(u<=127)++len;else if(u<=2047)len+=2;else if(u<=65535)len+=3;else len+=4}return len}var UTF16Decoder=typeof TextDecoder!=="undefined"?new TextDecoder("utf-16le"):undefined;function writeArrayToMemory(array,buffer){HEAP8.set(array,buffer)}function 
writeAsciiToMemory(str,buffer,dontAddNull){for(var i=0;i<str.length;++i){HEAP8[buffer++>>0]=str.charCodeAt(i)}if(!dontAddNull)HEAP8[buffer>>0]=0}var buffer,HEAP8,HEAPU8,HEAP16,HEAPU16,HEAP32,HEAPU32,HEAPF32,HEAPF64;function updateGlobalBufferAndViews(buf){buffer=buf;Module["HEAP8"]=HEAP8=new Int8Array(buf);Module["HEAP16"]=HEAP16=new Int16Array(buf);Module["HEAP32"]=HEAP32=new Int32Array(buf);Module["HEAPU8"]=HEAPU8=new Uint8Array(buf);Module["HEAPU16"]=HEAPU16=new Uint16Array(buf);Module["HEAPU32"]=HEAPU32=new Uint32Array(buf);Module["HEAPF32"]=HEAPF32=new Float32Array(buf);Module["HEAPF64"]=HEAPF64=new Float64Array(buf)}var INITIAL_MEMORY=Module["INITIAL_MEMORY"]||80019456;var wasmTable;var __ATPRERUN__=[];var __ATINIT__=[];var __ATMAIN__=[];var __ATPOSTRUN__=[];var runtimeInitialized=false;var runtimeExited=false;function preRun(){if(Module["preRun"]){if(typeof Module["preRun"]=="function")Module["preRun"]=[Module["preRun"]];while(Module["preRun"].length){addOnPreRun(Module["preRun"].shift())}}callRuntimeCallbacks(__ATPRERUN__)}function initRuntime(){runtimeInitialized=true;callRuntimeCallbacks(__ATINIT__)}function preMain(){callRuntimeCallbacks(__ATMAIN__)}function exitRuntime(){runtimeExited=true}function postRun(){if(Module["postRun"]){if(typeof Module["postRun"]=="function")Module["postRun"]=[Module["postRun"]];while(Module["postRun"].length){addOnPostRun(Module["postRun"].shift())}}callRuntimeCallbacks(__ATPOSTRUN__)}function addOnPreRun(cb){__ATPRERUN__.unshift(cb)}function addOnPostRun(cb){__ATPOSTRUN__.unshift(cb)}var runDependencies=0;var runDependencyWatcher=null;var dependenciesFulfilled=null;function addRunDependency(id){runDependencies++;if(Module["monitorRunDependencies"]){Module["monitorRunDependencies"](runDependencies)}}function 
removeRunDependency(id){runDependencies--;if(Module["monitorRunDependencies"]){Module["monitorRunDependencies"](runDependencies)}if(runDependencies==0){if(runDependencyWatcher!==null){clearInterval(runDependencyWatcher);runDependencyWatcher=null}if(dependenciesFulfilled){var callback=dependenciesFulfilled;dependenciesFulfilled=null;callback()}}}Module["preloadedImages"]={};Module["preloadedAudios"]={};function abort(what){if(Module["onAbort"]){Module["onAbort"](what)}what+="";err(what);ABORT=true;EXITSTATUS=1;what="abort("+what+"). Build with -s ASSERTIONS=1 for more info.";var e=new WebAssembly.RuntimeError(what);throw e}function hasPrefix(str,prefix){return String.prototype.startsWith?str.startsWith(prefix):str.indexOf(prefix)===0}var dataURIPrefix="data:application/octet-stream;base64,";function isDataURI(filename){return hasPrefix(filename,dataURIPrefix)}var fileURIPrefix="file://";function isFileURI(filename){return hasPrefix(filename,fileURIPrefix)}var wasmBinaryFile="basis.wasm";if(!isDataURI(wasmBinaryFile)){wasmBinaryFile=locateFile(wasmBinaryFile)}function getBinary(){try{if(wasmBinary){return new Uint8Array(wasmBinary)}if(readBinary){return readBinary(wasmBinaryFile)}else{throw"both async and sync fetching of the wasm failed"}}catch(err){abort(err)}}function getBinaryPromise(){if(!wasmBinary&&(ENVIRONMENT_IS_WEB||ENVIRONMENT_IS_WORKER)&&typeof fetch==="function"&&!isFileURI(wasmBinaryFile)){return fetch(wasmBinaryFile,{credentials:"same-origin"}).then(function(response){if(!response["ok"]){throw"failed to load wasm binary file at '"+wasmBinaryFile+"'"}return response["arrayBuffer"]()}).catch(function(){return getBinary()})}return Promise.resolve().then(getBinary)}function createWasm(){var info={"env":asmLibraryArg,"wasi_snapshot_preview1":asmLibraryArg};function receiveInstance(instance,module){var 
exports=instance.exports;Module["asm"]=exports;wasmMemory=Module["asm"]["memory"];updateGlobalBufferAndViews(wasmMemory.buffer);wasmTable=Module["asm"]["__indirect_function_table"];removeRunDependency("wasm-instantiate")}addRunDependency("wasm-instantiate");function receiveInstantiatedSource(output){receiveInstance(output["instance"])}function instantiateArrayBuffer(receiver){return getBinaryPromise().then(function(binary){return WebAssembly.instantiate(binary,info)}).then(receiver,function(reason){err("failed to asynchronously prepare wasm: "+reason);abort(reason)})}function instantiateAsync(){if(!wasmBinary&&typeof WebAssembly.instantiateStreaming==="function"&&!isDataURI(wasmBinaryFile)&&!isFileURI(wasmBinaryFile)&&typeof fetch==="function"){return fetch(wasmBinaryFile,{credentials:"same-origin"}).then(function(response){var result=WebAssembly.instantiateStreaming(response,info);return result.then(receiveInstantiatedSource,function(reason){err("wasm streaming compile failed: "+reason);err("falling back to ArrayBuffer instantiation");return instantiateArrayBuffer(receiveInstantiatedSource)})})}else{return instantiateArrayBuffer(receiveInstantiatedSource)}}if(Module["instantiateWasm"]){try{var exports=Module["instantiateWasm"](info,receiveInstance);return exports}catch(e){err("Module.instantiateWasm callback failed with error: "+e);return false}}instantiateAsync();return{}}var tempDouble;var tempI64;function callRuntimeCallbacks(callbacks){while(callbacks.length>0){var callback=callbacks.shift();if(typeof callback=="function"){callback(Module);continue}var func=callback.func;if(typeof func==="number"){if(callback.arg===undefined){wasmTable.get(func)()}else{wasmTable.get(func)(callback.arg)}}else{func(callback.arg===undefined?null:callback.arg)}}}function demangle(func){return func}function demangleAll(text){var regex=/\b_Z[\w\d_]+/g;return text.replace(regex,function(x){var y=demangle(x);return x===y?x:y+" ["+x+"]"})}function jsStackTrace(){var error=new 
Error;if(!error.stack){try{throw new Error}catch(e){error=e}if(!error.stack){return"(no stack trace available)"}}return error.stack.toString()}function ___assert_fail(condition,filename,line,func){abort("Assertion failed: "+UTF8ToString(condition)+", at: "+[filename?UTF8ToString(filename):"unknown filename",line,func?UTF8ToString(func):"unknown function"])}var ExceptionInfoAttrs={DESTRUCTOR_OFFSET:0,REFCOUNT_OFFSET:4,TYPE_OFFSET:8,CAUGHT_OFFSET:12,RETHROWN_OFFSET:13,SIZE:16};function ___cxa_allocate_exception(size){return _malloc(size+ExceptionInfoAttrs.SIZE)+ExceptionInfoAttrs.SIZE}function ExceptionInfo(excPtr){this.excPtr=excPtr;this.ptr=excPtr-ExceptionInfoAttrs.SIZE;this.set_type=function(type){HEAP32[this.ptr+ExceptionInfoAttrs.TYPE_OFFSET>>2]=type};this.get_type=function(){return HEAP32[this.ptr+ExceptionInfoAttrs.TYPE_OFFSET>>2]};this.set_destructor=function(destructor){HEAP32[this.ptr+ExceptionInfoAttrs.DESTRUCTOR_OFFSET>>2]=destructor};this.get_destructor=function(){return HEAP32[this.ptr+ExceptionInfoAttrs.DESTRUCTOR_OFFSET>>2]};this.set_refcount=function(refcount){HEAP32[this.ptr+ExceptionInfoAttrs.REFCOUNT_OFFSET>>2]=refcount};this.set_caught=function(caught){caught=caught?1:0;HEAP8[this.ptr+ExceptionInfoAttrs.CAUGHT_OFFSET>>0]=caught};this.get_caught=function(){return HEAP8[this.ptr+ExceptionInfoAttrs.CAUGHT_OFFSET>>0]!=0};this.set_rethrown=function(rethrown){rethrown=rethrown?1:0;HEAP8[this.ptr+ExceptionInfoAttrs.RETHROWN_OFFSET>>0]=rethrown};this.get_rethrown=function(){return HEAP8[this.ptr+ExceptionInfoAttrs.RETHROWN_OFFSET>>0]!=0};this.init=function(type,destructor){this.set_type(type);this.set_destructor(destructor);this.set_refcount(0);this.set_caught(false);this.set_rethrown(false)};this.add_ref=function(){var value=HEAP32[this.ptr+ExceptionInfoAttrs.REFCOUNT_OFFSET>>2];HEAP32[this.ptr+ExceptionInfoAttrs.REFCOUNT_OFFSET>>2]=value+1};this.release_ref=function(){var 
prev=HEAP32[this.ptr+ExceptionInfoAttrs.REFCOUNT_OFFSET>>2];HEAP32[this.ptr+ExceptionInfoAttrs.REFCOUNT_OFFSET>>2]=prev-1;return prev===1}}var exceptionLast=0;var uncaughtExceptionCount=0;function ___cxa_throw(ptr,type,destructor){var info=new ExceptionInfo(ptr);info.init(type,destructor);exceptionLast=ptr;uncaughtExceptionCount++;throw ptr}function _abort(){abort()}function _emscripten_memcpy_big(dest,src,num){HEAPU8.copyWithin(dest,src,src+num)}function abortOnCannotGrowMemory(requestedSize){abort("OOM")}function _emscripten_resize_heap(requestedSize){requestedSize=requestedSize>>>0;abortOnCannotGrowMemory(requestedSize)}var ASSERTIONS=false;__ATINIT__.push({func:function(){___wasm_call_ctors()}});var asmLibraryArg={"__assert_fail":___assert_fail,"__cxa_allocate_exception":___cxa_allocate_exception,"__cxa_throw":___cxa_throw,"abort":_abort,"emscripten_memcpy_big":_emscripten_memcpy_big,"emscripten_resize_heap":_emscripten_resize_heap};var asm=createWasm();var ___wasm_call_ctors=Module["___wasm_call_ctors"]=function(){return(___wasm_call_ctors=Module["___wasm_call_ctors"]=Module["asm"]["__wasm_call_ctors"]).apply(null,arguments)};var _malloc=Module["_malloc"]=function(){return(_malloc=Module["_malloc"]=Module["asm"]["malloc"]).apply(null,arguments)};var _free=Module["_free"]=function(){return(_free=Module["_free"]=Module["asm"]["free"]).apply(null,arguments)};var _basis_init=Module["_basis_init"]=function(){return(_basis_init=Module["_basis_init"]=Module["asm"]["basis_init"]).apply(null,arguments)};var _basis_open=Module["_basis_open"]=function(){return(_basis_open=Module["_basis_open"]=Module["asm"]["basis_open"]).apply(null,arguments)};var _basis_close=Module["_basis_close"]=function(){return(_basis_close=Module["_basis_close"]=Module["asm"]["basis_close"]).apply(null,arguments)};var 
_basis_get_has_alpha=Module["_basis_get_has_alpha"]=function(){return(_basis_get_has_alpha=Module["_basis_get_has_alpha"]=Module["asm"]["basis_get_has_alpha"]).apply(null,arguments)};var _basis_get_num_images=Module["_basis_get_num_images"]=function(){return(_basis_get_num_images=Module["_basis_get_num_images"]=Module["asm"]["basis_get_num_images"]).apply(null,arguments)};var _basis_get_num_levels=Module["_basis_get_num_levels"]=function(){return(_basis_get_num_levels=Module["_basis_get_num_levels"]=Module["asm"]["basis_get_num_levels"]).apply(null,arguments)};var _basis_get_image_width=Module["_basis_get_image_width"]=function(){return(_basis_get_image_width=Module["_basis_get_image_width"]=Module["asm"]["basis_get_image_width"]).apply(null,arguments)};var _basis_get_image_height=Module["_basis_get_image_height"]=function(){return(_basis_get_image_height=Module["_basis_get_image_height"]=Module["asm"]["basis_get_image_height"]).apply(null,arguments)};var _basis_get_image_transcoded_size_in_bytes=Module["_basis_get_image_transcoded_size_in_bytes"]=function(){return(_basis_get_image_transcoded_size_in_bytes=Module["_basis_get_image_transcoded_size_in_bytes"]=Module["asm"]["basis_get_image_transcoded_size_in_bytes"]).apply(null,arguments)};var _basis_start_transcoding=Module["_basis_start_transcoding"]=function(){return(_basis_start_transcoding=Module["_basis_start_transcoding"]=Module["asm"]["basis_start_transcoding"]).apply(null,arguments)};var _basis_transcode_image=Module["_basis_transcode_image"]=function(){return(_basis_transcode_image=Module["_basis_transcode_image"]=Module["asm"]["basis_transcode_image"]).apply(null,arguments)};var ___errno_location=Module["___errno_location"]=function(){return(___errno_location=Module["___errno_location"]=Module["asm"]["__errno_location"]).apply(null,arguments)};var stackSave=Module["stackSave"]=function(){return(stackSave=Module["stackSave"]=Module["asm"]["stackSave"]).apply(null,arguments)};var 
stackRestore=Module["stackRestore"]=function(){return(stackRestore=Module["stackRestore"]=Module["asm"]["stackRestore"]).apply(null,arguments)};var stackAlloc=Module["stackAlloc"]=function(){return(stackAlloc=Module["stackAlloc"]=Module["asm"]["stackAlloc"]).apply(null,arguments)};var _setThrew=Module["_setThrew"]=function(){return(_setThrew=Module["_setThrew"]=Module["asm"]["setThrew"]).apply(null,arguments)};var calledRun;function ExitStatus(status){this.name="ExitStatus";this.message="Program terminated with exit("+status+")";this.status=status}dependenciesFulfilled=function runCaller(){if(!calledRun)run();if(!calledRun)dependenciesFulfilled=runCaller};function run(args){args=args||arguments_;if(runDependencies>0){return}preRun();if(runDependencies>0)return;function doRun(){if(calledRun)return;calledRun=true;Module["calledRun"]=true;if(ABORT)return;initRuntime();preMain();if(Module["onRuntimeInitialized"])Module["onRuntimeInitialized"]();postRun()}if(Module["setStatus"]){Module["setStatus"]("Running...");setTimeout(function(){setTimeout(function(){Module["setStatus"]("")},1);doRun()},1)}else{doRun()}}Module["run"]=run;if(Module["preInit"]){if(typeof Module["preInit"]=="function")Module["preInit"]=[Module["preInit"]];while(Module["preInit"].length>0){Module["preInit"].pop()()}}noExitRuntime=true;run();
+var Module=typeof Module!=="undefined"?Module:{};var moduleOverrides={};var key;for(key in Module){if(Module.hasOwnProperty(key)){moduleOverrides[key]=Module[key]}}var arguments_=[];var thisProgram="./this.program";var quit_=function(status,toThrow){throw toThrow};var ENVIRONMENT_IS_WEB=false;var ENVIRONMENT_IS_WORKER=false;var ENVIRONMENT_IS_NODE=false;var ENVIRONMENT_IS_SHELL=false;ENVIRONMENT_IS_WEB=typeof window==="object";ENVIRONMENT_IS_WORKER=typeof importScripts==="function";ENVIRONMENT_IS_NODE=typeof process==="object"&&typeof process.versions==="object"&&typeof process.versions.node==="string";ENVIRONMENT_IS_SHELL=!ENVIRONMENT_IS_WEB&&!ENVIRONMENT_IS_NODE&&!ENVIRONMENT_IS_WORKER;var scriptDirectory="";function locateFile(path){if(Module["locateFile"]){return Module["locateFile"](path,scriptDirectory)}return scriptDirectory+path}var read_,readAsync,readBinary,setWindowTitle;var nodeFS;var nodePath;if(ENVIRONMENT_IS_NODE){if(ENVIRONMENT_IS_WORKER){scriptDirectory=require("path").dirname(scriptDirectory)+"/"}else{scriptDirectory=__dirname+"/"}read_=function shell_read(filename,binary){if(!nodeFS)nodeFS=require("fs");if(!nodePath)nodePath=require("path");filename=nodePath["normalize"](filename);return nodeFS["readFileSync"](filename,binary?null:"utf8")};readBinary=function readBinary(filename){var ret=read_(filename,true);if(!ret.buffer){ret=new Uint8Array(ret)}assert(ret.buffer);return ret};if(process["argv"].length>1){thisProgram=process["argv"][1].replace(/\\/g,"/")}arguments_=process["argv"].slice(2);if(typeof module!=="undefined"){module["exports"]=Module}process["on"]("uncaughtException",function(ex){if(!(ex instanceof ExitStatus)){throw ex}});process["on"]("unhandledRejection",abort);quit_=function(status){process["exit"](status)};Module["inspect"]=function(){return"[Emscripten Module object]"}}else if(ENVIRONMENT_IS_SHELL){if(typeof read!="undefined"){read_=function shell_read(f){return read(f)}}readBinary=function readBinary(f){var data;if(typeof 
readbuffer==="function"){return new Uint8Array(readbuffer(f))}data=read(f,"binary");assert(typeof data==="object");return data};if(typeof scriptArgs!="undefined"){arguments_=scriptArgs}else if(typeof arguments!="undefined"){arguments_=arguments}if(typeof quit==="function"){quit_=function(status){quit(status)}}if(typeof print!=="undefined"){if(typeof console==="undefined")console={};console.log=print;console.warn=console.error=typeof printErr!=="undefined"?printErr:print}}else if(ENVIRONMENT_IS_WEB||ENVIRONMENT_IS_WORKER){if(ENVIRONMENT_IS_WORKER){scriptDirectory=self.location.href}else if(typeof document!=="undefined"&&document.currentScript){scriptDirectory=document.currentScript.src}if(scriptDirectory.indexOf("blob:")!==0){scriptDirectory=scriptDirectory.substr(0,scriptDirectory.lastIndexOf("/")+1)}else{scriptDirectory=""}{read_=function shell_read(url){var xhr=new XMLHttpRequest;xhr.open("GET",url,false);xhr.send(null);return xhr.responseText};if(ENVIRONMENT_IS_WORKER){readBinary=function readBinary(url){var xhr=new XMLHttpRequest;xhr.open("GET",url,false);xhr.responseType="arraybuffer";xhr.send(null);return new Uint8Array(xhr.response)}}readAsync=function readAsync(url,onload,onerror){var xhr=new XMLHttpRequest;xhr.open("GET",url,true);xhr.responseType="arraybuffer";xhr.onload=function xhr_onload(){if(xhr.status==200||xhr.status==0&&xhr.response){onload(xhr.response);return}onerror()};xhr.onerror=onerror;xhr.send(null)}}setWindowTitle=function(title){document.title=title}}else{}var out=Module["print"]||console.log.bind(console);var err=Module["printErr"]||console.warn.bind(console);for(key in moduleOverrides){if(moduleOverrides.hasOwnProperty(key)){Module[key]=moduleOverrides[key]}}moduleOverrides=null;if(Module["arguments"])arguments_=Module["arguments"];if(Module["thisProgram"])thisProgram=Module["thisProgram"];if(Module["quit"])quit_=Module["quit"];var STACK_ALIGN=16;function 
warnOnce(text){if(!warnOnce.shown)warnOnce.shown={};if(!warnOnce.shown[text]){warnOnce.shown[text]=1;err(text)}}function convertJsFunctionToWasm(func,sig){if(typeof WebAssembly.Function==="function"){var typeNames={"i":"i32","j":"i64","f":"f32","d":"f64"};var type={parameters:[],results:sig[0]=="v"?[]:[typeNames[sig[0]]]};for(var i=1;i<sig.length;++i){type.parameters.push(typeNames[sig[i]])}return new WebAssembly.Function(type,func)}var typeSection=[1,0,1,96];var sigRet=sig.slice(0,1);var sigParam=sig.slice(1);var typeCodes={"i":127,"j":126,"f":125,"d":124};typeSection.push(sigParam.length);for(var i=0;i<sigParam.length;++i){typeSection.push(typeCodes[sigParam[i]])}if(sigRet=="v"){typeSection.push(0)}else{typeSection=typeSection.concat([1,typeCodes[sigRet]])}typeSection[1]=typeSection.length-2;var bytes=new Uint8Array([0,97,115,109,1,0,0,0].concat(typeSection,[2,7,1,1,101,1,102,0,0,7,5,1,1,102,0,0]));var module=new WebAssembly.Module(bytes);var instance=new WebAssembly.Instance(module,{"e":{"f":func}});var wrappedFunc=instance.exports["f"];return wrappedFunc}var freeTableIndexes=[];var functionsInTableMap;function getEmptyTableSlot(){if(freeTableIndexes.length){return freeTableIndexes.pop()}try{wasmTable.grow(1)}catch(err){if(!(err instanceof RangeError)){throw err}throw"Unable to grow wasm table. 
Set ALLOW_TABLE_GROWTH."}return wasmTable.length-1}function addFunctionWasm(func,sig){if(!functionsInTableMap){functionsInTableMap=new WeakMap;for(var i=0;i<wasmTable.length;i++){var item=wasmTable.get(i);if(item){functionsInTableMap.set(item,i)}}}if(functionsInTableMap.has(func)){return functionsInTableMap.get(func)}var ret=getEmptyTableSlot();try{wasmTable.set(ret,func)}catch(err){if(!(err instanceof TypeError)){throw err}var wrapped=convertJsFunctionToWasm(func,sig);wasmTable.set(ret,wrapped)}functionsInTableMap.set(func,ret);return ret}var tempRet0=0;var setTempRet0=function(value){tempRet0=value};var wasmBinary;if(Module["wasmBinary"])wasmBinary=Module["wasmBinary"];var noExitRuntime;if(Module["noExitRuntime"])noExitRuntime=Module["noExitRuntime"];if(typeof WebAssembly!=="object"){abort("no native wasm support detected")}var wasmMemory;var ABORT=false;var EXITSTATUS;function assert(condition,text){if(!condition){abort("Assertion failed: "+text)}}function getCFunc(ident){var func=Module["_"+ident];assert(func,"Cannot call unknown function "+ident+", make sure it is exported");return func}function ccall(ident,returnType,argTypes,args,opts){var toC={"string":function(str){var ret=0;if(str!==null&&str!==undefined&&str!==0){var len=(str.length<<2)+1;ret=stackAlloc(len);stringToUTF8(str,ret,len)}return ret},"array":function(arr){var ret=stackAlloc(arr.length);writeArrayToMemory(arr,ret);return ret}};function convertReturnValue(ret){if(returnType==="string")return UTF8ToString(ret);if(returnType==="boolean")return Boolean(ret);return ret}var func=getCFunc(ident);var cArgs=[];var stack=0;if(args){for(var i=0;i<args.length;i++){var converter=toC[argTypes[i]];if(converter){if(stack===0)stack=stackSave();cArgs[i]=converter(args[i])}else{cArgs[i]=args[i]}}}var ret=func.apply(null,cArgs);ret=convertReturnValue(ret);if(stack!==0)stackRestore(stack);return ret}var ALLOC_STACK=1;var UTF8Decoder=typeof TextDecoder!=="undefined"?new TextDecoder("utf8"):undefined;function 
UTF8ArrayToString(heap,idx,maxBytesToRead){var endIdx=idx+maxBytesToRead;var endPtr=idx;while(heap[endPtr]&&!(endPtr>=endIdx))++endPtr;if(endPtr-idx>16&&heap.subarray&&UTF8Decoder){return UTF8Decoder.decode(heap.subarray(idx,endPtr))}else{var str="";while(idx<endPtr){var u0=heap[idx++];if(!(u0&128)){str+=String.fromCharCode(u0);continue}var u1=heap[idx++]&63;if((u0&224)==192){str+=String.fromCharCode((u0&31)<<6|u1);continue}var u2=heap[idx++]&63;if((u0&240)==224){u0=(u0&15)<<12|u1<<6|u2}else{u0=(u0&7)<<18|u1<<12|u2<<6|heap[idx++]&63}if(u0<65536){str+=String.fromCharCode(u0)}else{var ch=u0-65536;str+=String.fromCharCode(55296|ch>>10,56320|ch&1023)}}}return str}function UTF8ToString(ptr,maxBytesToRead){return ptr?UTF8ArrayToString(HEAPU8,ptr,maxBytesToRead):""}function stringToUTF8Array(str,heap,outIdx,maxBytesToWrite){if(!(maxBytesToWrite>0))return 0;var startIdx=outIdx;var endIdx=outIdx+maxBytesToWrite-1;for(var i=0;i<str.length;++i){var u=str.charCodeAt(i);if(u>=55296&&u<=57343){var u1=str.charCodeAt(++i);u=65536+((u&1023)<<10)|u1&1023}if(u<=127){if(outIdx>=endIdx)break;heap[outIdx++]=u}else if(u<=2047){if(outIdx+1>=endIdx)break;heap[outIdx++]=192|u>>6;heap[outIdx++]=128|u&63}else if(u<=65535){if(outIdx+2>=endIdx)break;heap[outIdx++]=224|u>>12;heap[outIdx++]=128|u>>6&63;heap[outIdx++]=128|u&63}else{if(outIdx+3>=endIdx)break;heap[outIdx++]=240|u>>18;heap[outIdx++]=128|u>>12&63;heap[outIdx++]=128|u>>6&63;heap[outIdx++]=128|u&63}}heap[outIdx]=0;return outIdx-startIdx}function stringToUTF8(str,outPtr,maxBytesToWrite){return stringToUTF8Array(str,HEAPU8,outPtr,maxBytesToWrite)}function lengthBytesUTF8(str){var len=0;for(var i=0;i<str.length;++i){var u=str.charCodeAt(i);if(u>=55296&&u<=57343)u=65536+((u&1023)<<10)|str.charCodeAt(++i)&1023;if(u<=127)++len;else if(u<=2047)len+=2;else if(u<=65535)len+=3;else len+=4}return len}var UTF16Decoder=typeof TextDecoder!=="undefined"?new TextDecoder("utf-16le"):undefined;function 
writeArrayToMemory(array,buffer){HEAP8.set(array,buffer)}function writeAsciiToMemory(str,buffer,dontAddNull){for(var i=0;i<str.length;++i){HEAP8[buffer++>>0]=str.charCodeAt(i)}if(!dontAddNull)HEAP8[buffer>>0]=0}var buffer,HEAP8,HEAPU8,HEAP16,HEAPU16,HEAP32,HEAPU32,HEAPF32,HEAPF64;function updateGlobalBufferAndViews(buf){buffer=buf;Module["HEAP8"]=HEAP8=new Int8Array(buf);Module["HEAP16"]=HEAP16=new Int16Array(buf);Module["HEAP32"]=HEAP32=new Int32Array(buf);Module["HEAPU8"]=HEAPU8=new Uint8Array(buf);Module["HEAPU16"]=HEAPU16=new Uint16Array(buf);Module["HEAPU32"]=HEAPU32=new Uint32Array(buf);Module["HEAPF32"]=HEAPF32=new Float32Array(buf);Module["HEAPF64"]=HEAPF64=new Float64Array(buf)}var INITIAL_MEMORY=Module["INITIAL_MEMORY"]||749993984;var wasmTable;var __ATPRERUN__=[];var __ATINIT__=[];var __ATMAIN__=[];var __ATPOSTRUN__=[];var runtimeInitialized=false;var runtimeExited=false;function preRun(){if(Module["preRun"]){if(typeof Module["preRun"]=="function")Module["preRun"]=[Module["preRun"]];while(Module["preRun"].length){addOnPreRun(Module["preRun"].shift())}}callRuntimeCallbacks(__ATPRERUN__)}function initRuntime(){runtimeInitialized=true;callRuntimeCallbacks(__ATINIT__)}function preMain(){callRuntimeCallbacks(__ATMAIN__)}function exitRuntime(){runtimeExited=true}function postRun(){if(Module["postRun"]){if(typeof Module["postRun"]=="function")Module["postRun"]=[Module["postRun"]];while(Module["postRun"].length){addOnPostRun(Module["postRun"].shift())}}callRuntimeCallbacks(__ATPOSTRUN__)}function addOnPreRun(cb){__ATPRERUN__.unshift(cb)}function addOnPostRun(cb){__ATPOSTRUN__.unshift(cb)}var runDependencies=0;var runDependencyWatcher=null;var dependenciesFulfilled=null;function addRunDependency(id){runDependencies++;if(Module["monitorRunDependencies"]){Module["monitorRunDependencies"](runDependencies)}}function 
removeRunDependency(id){runDependencies--;if(Module["monitorRunDependencies"]){Module["monitorRunDependencies"](runDependencies)}if(runDependencies==0){if(runDependencyWatcher!==null){clearInterval(runDependencyWatcher);runDependencyWatcher=null}if(dependenciesFulfilled){var callback=dependenciesFulfilled;dependenciesFulfilled=null;callback()}}}Module["preloadedImages"]={};Module["preloadedAudios"]={};function abort(what){if(Module["onAbort"]){Module["onAbort"](what)}what+="";err(what);ABORT=true;EXITSTATUS=1;what="abort("+what+"). Build with -s ASSERTIONS=1 for more info.";var e=new WebAssembly.RuntimeError(what);throw e}function hasPrefix(str,prefix){return String.prototype.startsWith?str.startsWith(prefix):str.indexOf(prefix)===0}var dataURIPrefix="data:application/octet-stream;base64,";function isDataURI(filename){return hasPrefix(filename,dataURIPrefix)}var fileURIPrefix="file://";function isFileURI(filename){return hasPrefix(filename,fileURIPrefix)}var wasmBinaryFile="basis.wasm";if(!isDataURI(wasmBinaryFile)){wasmBinaryFile=locateFile(wasmBinaryFile)}function getBinary(){try{if(wasmBinary){return new Uint8Array(wasmBinary)}if(readBinary){return readBinary(wasmBinaryFile)}else{throw"both async and sync fetching of the wasm failed"}}catch(err){abort(err)}}function getBinaryPromise(){if(!wasmBinary&&(ENVIRONMENT_IS_WEB||ENVIRONMENT_IS_WORKER)&&typeof fetch==="function"&&!isFileURI(wasmBinaryFile)){return fetch(wasmBinaryFile,{credentials:"same-origin"}).then(function(response){if(!response["ok"]){throw"failed to load wasm binary file at '"+wasmBinaryFile+"'"}return response["arrayBuffer"]()}).catch(function(){return getBinary()})}return Promise.resolve().then(getBinary)}function createWasm(){var info={"env":asmLibraryArg,"wasi_snapshot_preview1":asmLibraryArg};function receiveInstance(instance,module){var 
exports=instance.exports;Module["asm"]=exports;wasmMemory=Module["asm"]["memory"];updateGlobalBufferAndViews(wasmMemory.buffer);wasmTable=Module["asm"]["__indirect_function_table"];removeRunDependency("wasm-instantiate")}addRunDependency("wasm-instantiate");function receiveInstantiatedSource(output){receiveInstance(output["instance"])}function instantiateArrayBuffer(receiver){return getBinaryPromise().then(function(binary){return WebAssembly.instantiate(binary,info)}).then(receiver,function(reason){err("failed to asynchronously prepare wasm: "+reason);abort(reason)})}function instantiateAsync(){if(!wasmBinary&&typeof WebAssembly.instantiateStreaming==="function"&&!isDataURI(wasmBinaryFile)&&!isFileURI(wasmBinaryFile)&&typeof fetch==="function"){return fetch(wasmBinaryFile,{credentials:"same-origin"}).then(function(response){var result=WebAssembly.instantiateStreaming(response,info);return result.then(receiveInstantiatedSource,function(reason){err("wasm streaming compile failed: "+reason);err("falling back to ArrayBuffer instantiation");return instantiateArrayBuffer(receiveInstantiatedSource)})})}else{return instantiateArrayBuffer(receiveInstantiatedSource)}}if(Module["instantiateWasm"]){try{var exports=Module["instantiateWasm"](info,receiveInstance);return exports}catch(e){err("Module.instantiateWasm callback failed with error: "+e);return false}}instantiateAsync();return{}}var tempDouble;var tempI64;function callRuntimeCallbacks(callbacks){while(callbacks.length>0){var callback=callbacks.shift();if(typeof callback=="function"){callback(Module);continue}var func=callback.func;if(typeof func==="number"){if(callback.arg===undefined){wasmTable.get(func)()}else{wasmTable.get(func)(callback.arg)}}else{func(callback.arg===undefined?null:callback.arg)}}}function demangle(func){return func}function demangleAll(text){var regex=/\b_Z[\w\d_]+/g;return text.replace(regex,function(x){var y=demangle(x);return x===y?x:y+" ["+x+"]"})}function jsStackTrace(){var error=new 
Error;if(!error.stack){try{throw new Error}catch(e){error=e}if(!error.stack){return"(no stack trace available)"}}return error.stack.toString()}function ___assert_fail(condition,filename,line,func){abort("Assertion failed: "+UTF8ToString(condition)+", at: "+[filename?UTF8ToString(filename):"unknown filename",line,func?UTF8ToString(func):"unknown function"])}function _abort(){abort()}function _emscripten_memcpy_big(dest,src,num){HEAPU8.copyWithin(dest,src,src+num)}function abortOnCannotGrowMemory(requestedSize){abort("OOM")}function _emscripten_resize_heap(requestedSize){requestedSize=requestedSize>>>0;abortOnCannotGrowMemory(requestedSize)}var SYSCALLS={mappings:{},buffers:[null,[],[]],printChar:function(stream,curr){var buffer=SYSCALLS.buffers[stream];if(curr===0||curr===10){(stream===1?out:err)(UTF8ArrayToString(buffer,0));buffer.length=0}else{buffer.push(curr)}},varargs:undefined,get:function(){SYSCALLS.varargs+=4;var ret=HEAP32[SYSCALLS.varargs-4>>2];return ret},getStr:function(ptr){var ret=UTF8ToString(ptr);return ret},get64:function(low,high){return low}};function _fd_close(fd){return 0}function _fd_seek(fd,offset_low,offset_high,whence,newOffset){}function _fd_write(fd,iov,iovcnt,pnum){var num=0;for(var i=0;i<iovcnt;i++){var ptr=HEAP32[iov+i*8>>2];var len=HEAP32[iov+(i*8+4)>>2];for(var j=0;j<len;j++){SYSCALLS.printChar(fd,HEAPU8[ptr+j])}num+=len}HEAP32[pnum>>2]=num;return 0}function _setTempRet0($i){setTempRet0($i|0)}var ASSERTIONS=false;__ATINIT__.push({func:function(){___wasm_call_ctors()}});var asmLibraryArg={"__assert_fail":___assert_fail,"abort":_abort,"emscripten_memcpy_big":_emscripten_memcpy_big,"emscripten_resize_heap":_emscripten_resize_heap,"fd_close":_fd_close,"fd_seek":_fd_seek,"fd_write":_fd_write,"setTempRet0":_setTempRet0};var asm=createWasm();var ___wasm_call_ctors=Module["___wasm_call_ctors"]=function(){return(___wasm_call_ctors=Module["___wasm_call_ctors"]=Module["asm"]["__wasm_call_ctors"]).apply(null,arguments)};var 
_malloc=Module["_malloc"]=function(){return(_malloc=Module["_malloc"]=Module["asm"]["malloc"]).apply(null,arguments)};var _free=Module["_free"]=function(){return(_free=Module["_free"]=Module["asm"]["free"]).apply(null,arguments)};var _basis_init=Module["_basis_init"]=function(){return(_basis_init=Module["_basis_init"]=Module["asm"]["basis_init"]).apply(null,arguments)};var _basis_open=Module["_basis_open"]=function(){return(_basis_open=Module["_basis_open"]=Module["asm"]["basis_open"]).apply(null,arguments)};var _basis_close=Module["_basis_close"]=function(){return(_basis_close=Module["_basis_close"]=Module["asm"]["basis_close"]).apply(null,arguments)};var _basis_get_has_alpha=Module["_basis_get_has_alpha"]=function(){return(_basis_get_has_alpha=Module["_basis_get_has_alpha"]=Module["asm"]["basis_get_has_alpha"]).apply(null,arguments)};var _basis_get_num_images=Module["_basis_get_num_images"]=function(){return(_basis_get_num_images=Module["_basis_get_num_images"]=Module["asm"]["basis_get_num_images"]).apply(null,arguments)};var _basis_get_num_levels=Module["_basis_get_num_levels"]=function(){return(_basis_get_num_levels=Module["_basis_get_num_levels"]=Module["asm"]["basis_get_num_levels"]).apply(null,arguments)};var _basis_get_image_width=Module["_basis_get_image_width"]=function(){return(_basis_get_image_width=Module["_basis_get_image_width"]=Module["asm"]["basis_get_image_width"]).apply(null,arguments)};var _basis_get_image_height=Module["_basis_get_image_height"]=function(){return(_basis_get_image_height=Module["_basis_get_image_height"]=Module["asm"]["basis_get_image_height"]).apply(null,arguments)};var _basis_get_image_transcoded_size_in_bytes=Module["_basis_get_image_transcoded_size_in_bytes"]=function(){return(_basis_get_image_transcoded_size_in_bytes=Module["_basis_get_image_transcoded_size_in_bytes"]=Module["asm"]["basis_get_image_transcoded_size_in_bytes"]).apply(null,arguments)};var 
_basis_start_transcoding=Module["_basis_start_transcoding"]=function(){return(_basis_start_transcoding=Module["_basis_start_transcoding"]=Module["asm"]["basis_start_transcoding"]).apply(null,arguments)};var _basis_transcode_image=Module["_basis_transcode_image"]=function(){return(_basis_transcode_image=Module["_basis_transcode_image"]=Module["asm"]["basis_transcode_image"]).apply(null,arguments)};var ___errno_location=Module["___errno_location"]=function(){return(___errno_location=Module["___errno_location"]=Module["asm"]["__errno_location"]).apply(null,arguments)};var stackSave=Module["stackSave"]=function(){return(stackSave=Module["stackSave"]=Module["asm"]["stackSave"]).apply(null,arguments)};var stackRestore=Module["stackRestore"]=function(){return(stackRestore=Module["stackRestore"]=Module["asm"]["stackRestore"]).apply(null,arguments)};var stackAlloc=Module["stackAlloc"]=function(){return(stackAlloc=Module["stackAlloc"]=Module["asm"]["stackAlloc"]).apply(null,arguments)};var dynCall_jiji=Module["dynCall_jiji"]=function(){return(dynCall_jiji=Module["dynCall_jiji"]=Module["asm"]["dynCall_jiji"]).apply(null,arguments)};var calledRun;function ExitStatus(status){this.name="ExitStatus";this.message="Program terminated with exit("+status+")";this.status=status}dependenciesFulfilled=function runCaller(){if(!calledRun)run();if(!calledRun)dependenciesFulfilled=runCaller};function run(args){args=args||arguments_;if(runDependencies>0){return}preRun();if(runDependencies>0)return;function doRun(){if(calledRun)return;calledRun=true;Module["calledRun"]=true;if(ABORT)return;initRuntime();preMain();if(Module["onRuntimeInitialized"])Module["onRuntimeInitialized"]();postRun()}if(Module["setStatus"]){Module["setStatus"]("Running...");setTimeout(function(){setTimeout(function(){Module["setStatus"]("")},1);doRun()},1)}else{doRun()}}Module["run"]=run;if(Module["preInit"]){if(typeof 
Module["preInit"]=="function")Module["preInit"]=[Module["preInit"]];while(Module["preInit"].length>0){Module["preInit"].pop()()}}noExitRuntime=true;run();
diff --git a/webgl_videotest/basis.wasm b/webgl_videotest/basis.wasm
index 4061f92..d16a23a 100644
--- a/webgl_videotest/basis.wasm
+++ b/webgl_videotest/basis.wasm
Binary files differ
diff --git a/webgl_videotest/build.bat b/webgl_videotest/build.bat
index 8f7ef80..fb828e6 100644
--- a/webgl_videotest/build.bat
+++ b/webgl_videotest/build.bat
@@ -1,4 +1,4 @@
 @REM -O0 -s ASSERTIONS=1 -s DEMANGLE_SUPPORT=1
 @REM -O2 -s ASSERTIONS=0
-emcc -s EXPORTED_FUNCTIONS="['allocate', '_malloc', '_free', '_basis_init','_basis_open','_basis_close','_basis_get_has_alpha','_basis_get_num_images','_basis_get_num_levels','_basis_get_image_width','_basis_get_image_height','_basis_get_image_transcoded_size_in_bytes','_basis_transcode_image','_basis_start_transcoding','_basis_stop_transcoding','_basis_get_debug_flags','_basis_set_debug_flags']" -o basis.js -DBASISD_ENABLE_DEBUG_FLAGS=1 ../transcoder/basisu_transcoder.cpp basis_wrappers.cpp -s TOTAL_MEMORY=499974144 -std=c++11 -O2 -s ASSERTIONS=0 -I ../transcoder
+emcc -s EXPORTED_FUNCTIONS="['allocate', '_malloc', '_free', '_basis_init','_basis_open','_basis_close','_basis_get_has_alpha','_basis_get_num_images','_basis_get_num_levels','_basis_get_image_width','_basis_get_image_height','_basis_get_image_transcoded_size_in_bytes','_basis_transcode_image','_basis_start_transcoding','_basis_stop_transcoding','_basis_get_debug_flags','_basis_set_debug_flags']" -o basis.js -DBASISD_ENABLE_DEBUG_FLAGS=1 ../transcoder/basisu_transcoder.cpp basis_wrappers.cpp -s TOTAL_MEMORY=749993984 -std=c++11 -O2 -s ASSERTIONS=0 -I ../transcoder
 rem emcc -s EXPORTED_FUNCTIONS="['allocate', '_malloc', '_free', '_basis_init','_basis_open','_basis_close','_basis_get_has_alpha','_basis_get_num_images','_basis_get_num_levels','_basis_get_image_width','_basis_get_image_height','_basis_get_image_transcoded_size_in_bytes','_basis_transcode_image','_basis_start_transcoding','_basis_stop_transcoding','_basis_get_debug_flags','_basis_set_debug_flags']" -o basis.js -DBASISD_ENABLE_DEBUG_FLAGS=1 ../transcoder/basisu_transcoder.cpp basis_wrappers.cpp -s TOTAL_MEMORY=499974144 -std=c++11 -O0 -s ASSERTIONS=0 -I ../transcoder
diff --git a/webgl_videotest/build.sh b/webgl_videotest/build.sh
index 0391dbf..2e78137 100644
--- a/webgl_videotest/build.sh
+++ b/webgl_videotest/build.sh
@@ -1,3 +1,3 @@
 # rg - I haven't tested this shell script yet (I use build.bat on Windows)
-emcc -s EXPORTED_FUNCTIONS="['allocate', '_malloc', '_free', '_basis_init','_basis_open','_basis_close','_basis_get_has_alpha','_basis_get_num_images','_basis_get_num_levels','_basis_get_image_width','_basis_get_image_height','_basis_get_image_transcoded_size_in_bytes','_basis_transcode_image','_basis_start_transcoding']" -s TOTAL_MEMORY=80019456 -O2 -s ASSERTIONS=0 -I ../transcoder -o basis.js ../transcoder/basisu_transcoder.cpp basis_wrappers.cpp
+emcc -s EXPORTED_FUNCTIONS="['allocate', '_malloc', '_free', '_basis_init','_basis_open','_basis_close','_basis_get_has_alpha','_basis_get_num_images','_basis_get_num_levels','_basis_get_image_width','_basis_get_image_height','_basis_get_image_transcoded_size_in_bytes','_basis_transcode_image','_basis_start_transcoding']" -s TOTAL_MEMORY=749993984 -O2 -s ASSERTIONS=0 -I ../transcoder -o basis.js ../transcoder/basisu_transcoder.cpp basis_wrappers.cpp
 # chmod -R a+rX .