Merge pull request #325 from past-due/trivially_copyable

Allow defining BASISU_HAVE_STD_TRIVIALLY_COPYABLE
diff --git a/README.md b/README.md
index 068959c..f5c593d 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,6 @@
 # basis_universal
 Basis Universal Supercompressed GPU Texture Codec
 
-<sub>(This software is available under a free and permissive license (Apache 2.0), but like many other large open source projects it needs financial support to sustain its continued security, bug fixes and improvements. In addition to maintenance and stability there are many desirable features and new interchange formats we would like to add. If your company is using Basis Universal in a product, please consider reaching out.)</sub>
-
-Businesses: support continued development and maintenance via invoiced technical support, maintenance, or sponsoring contracts:
-<br>&nbsp;&nbsp;_E-mail: info @ binomial dot info_ or contact us on [Twitter](https://twitter.com/_binomial)
-
-Also see the [Sponsors](https://github.com/BinomialLLC/basis_universal/wiki/Sponsors-and-Supporters) wiki page.
-
 ----
 
 [![Build status](https://ci.appveyor.com/api/projects/status/87eb0o96pjho4sh0?svg=true)](https://ci.appveyor.com/project/BinomialLLC/basis-universal)
@@ -57,6 +50,9 @@
 
 The encoder optionally uses Zstandard's single source file compressor (in zstd/zstd.c) to support compressing supercompressed KTX2 files. The stand-alone transcoder (in the "transcoder" directory) is a single .cpp source file library which has no 3rd party code dependencies apart from zstd/zstddeclib.c, which is also technically optional. It's only used for decompressing UASTC KTX2 files that use Zstandard.
 
+### Note about Visual Studio 2022 (compiling under Windows)
+The C/C++ compiler that ships with MSVC 2022 (Platform Toolset v143) is buggy, and it's been this way for years. I've reported the bugs and I've had others verify that yes, it produces buggy code. I've been doing my best to work around the 2022 compiler bugs, but it's probably not worth the effort. Either use the LLVM (clang-cl) Platform Toolset, or switch to Platform Toolset v142 (Visual Studio 2019's compiler). I test with various versions of gcc and clang, and MSVC 2019.
+
 ### Command Line Compression Tool
 
 The command line tool used to create, validate, and transcode/unpack .basis/.KTX2 files is named "basisu". Run basisu without any parameters for help. 
@@ -77,6 +73,14 @@
 
 For Visual Studio 2019, you can now either use the CMakeLists.txt file or the included `basisu.sln` file. Earlier versions of Visual Studio (particularly 2017) should work but aren't actively tested. We develop with the most up to date version of 2019.
 
+To test the codec:
+
+`basisu -test`
+
+To test the codec in OpenCL mode (OpenCL libs/headers/drivers must be installed, and OpenCL support must have been compiled in by passing `-D OPENCL=TRUE` to cmake):
+
+`basisu -test -opencl`
+
 To compress a sRGB PNG/BMP/TGA/JPEG image to an ETC1S .KTX2 file:
 
 `basisu -ktx2 x.png`
@@ -315,3 +319,9 @@
 To ensure continued REUSE compliance, run `reuse lint` at the root of
 a clean, checked-out repository periodically, or run it during CI tests
 before any build artifacts have been created.
+
+----
+
+E-mail: info @ binomial dot info, or contact us on [Twitter](https://twitter.com/_binomial)
+
+Here's the [Sponsors](https://github.com/BinomialLLC/basis_universal/wiki/Sponsors-and-Supporters) wiki page.
diff --git a/basisu.manifest b/basisu.manifest
new file mode 100644
index 0000000..b4baf6b
--- /dev/null
+++ b/basisu.manifest
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<assembly manifestVersion="1.0" xmlns="urn:schemas-microsoft-com:asm.v1">
+  <assemblyIdentity type="win32" name="..." version="6.0.0.0"/>
+  <application>
+    <windowsSettings>
+      <activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
+    </windowsSettings>
+  </application>
+</assembly>
+
diff --git a/basisu.vcxproj b/basisu.vcxproj
index 182ed30..df25600 100644
--- a/basisu.vcxproj
+++ b/basisu.vcxproj
@@ -40,13 +40,13 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v143</PlatformToolset>
+    <PlatformToolset>v142</PlatformToolset>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v143</PlatformToolset>
+    <PlatformToolset>v142</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
@@ -152,6 +152,7 @@
       <FloatingPointModel>Fast</FloatingPointModel>
       <OmitFramePointers>true</OmitFramePointers>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
     </ClCompile>
     <Link>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
@@ -228,6 +229,9 @@
     <None Include="transcoder\basisu_transcoder_tables_dxt1_5.inc" />
     <None Include="transcoder\basisu_transcoder_tables_dxt1_6.inc" />
   </ItemGroup>
+  <ItemGroup>
+    <Manifest Include="basisu.manifest" />
+  </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
diff --git a/basisu.vcxproj.filters b/basisu.vcxproj.filters
index ee34dce..62107b9 100644
--- a/basisu.vcxproj.filters
+++ b/basisu.vcxproj.filters
@@ -198,4 +198,7 @@
       <UniqueIdentifier>{ab12ac82-9c39-494d-a36b-129dd1b2dec5}</UniqueIdentifier>
     </Filter>
   </ItemGroup>
+  <ItemGroup>
+    <Manifest Include="basisu.manifest" />
+  </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/basisu_tool.cpp b/basisu_tool.cpp
index 08f84e1..9f43e1c 100644
--- a/basisu_tool.cpp
+++ b/basisu_tool.cpp
@@ -32,6 +32,10 @@
 #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
 #include "encoder/basisu_miniz.h"
 
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
 // Set BASISU_CATCH_EXCEPTIONS if you want exceptions to crash the app, otherwise main() catches them.
 #ifndef BASISU_CATCH_EXCEPTIONS
 	#define BASISU_CATCH_EXCEPTIONS 0
@@ -40,7 +44,7 @@
 using namespace basisu;
 using namespace buminiz;
 
-#define BASISU_TOOL_VERSION "1.16.3"
+#define BASISU_TOOL_VERSION "1.16.4"
 
 enum tool_mode
 {
@@ -54,6 +58,7 @@
 	cBench,
 	cCompSize,
 	cTest,
+	cCLBench,
 	cSplitImage,
 	cCombineImages
 };
@@ -365,6 +370,8 @@
 				m_mode = cCompSize;
 			else if (strcasecmp(pArg, "-test") == 0)
 				m_mode = cTest;
+			else if (strcasecmp(pArg, "-clbench") == 0)
+				m_mode = cCLBench;
 			else if (strcasecmp(pArg, "-test_dir") == 0)
 			{
 				REMAINING_ARGS_CHECK(1);
@@ -4266,7 +4273,7 @@
 		size_t data_size = 0;
 
 		// Test ETC1S
-		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0);
+		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagPrintStats | cFlagPrintStatus;
 
 		void* pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats);
 		if (!pData)
@@ -4293,7 +4300,7 @@
 
 		if (opencl_is_available())
 		{
-			flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUseOpenCL;
+			flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUseOpenCL | cFlagPrintStats | cFlagPrintStatus;
 
 			pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats);
 			if (!pData)
@@ -4329,7 +4336,7 @@
 		}
 
 		// Test UASTC
-		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUASTC;
+		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUASTC | cFlagPrintStats | cFlagPrintStatus;
 
 		pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats);
 		if (!pData)
@@ -4362,6 +4369,24 @@
 	return result;
 }
 
+static bool clbench_mode(command_line_params& opts)
+{
+	BASISU_NOTE_UNUSED(opts);
+	// Handler for the -clbench tool mode: runs basis_benchmark_etc1s_opencl() and prints which ETC1S encoding path it judged faster.
+	bool opencl_failed = false;
+	bool use_cl = basis_benchmark_etc1s_opencl(&opencl_failed);
+	if (use_cl)
+		printf("OpenCL ETC1S encoding is faster on this machine\n");
+	else
+	{
+		if (opencl_failed)
+			printf("OpenCL failed!\n");
+		printf("CPU ETC1S encoding is faster on this machine\n"); // printed for every false result, including OpenCL being unavailable or a failed compress
+	}
+
+	return true; // always true, even when opencl_failed was set
+}
+
 static int main_internal(int argc, const char **argv)
 {
 	printf("Basis Universal GPU Texture Compressor v" BASISU_TOOL_VERSION "\nCopyright (C) 2019-2022 Binomial LLC, All rights reserved\n");
@@ -4374,7 +4399,7 @@
 	bool opencl_force_serialization = false;
 	for (int i = 1; i < argc; i++)
 	{
-		if (strcmp(argv[i], "-opencl") == 0)
+		if ((strcmp(argv[i], "-opencl") == 0) || (strcmp(argv[i], "-clbench") == 0))
 			use_opencl = true;
 		if (strcmp(argv[i], "-opencl_serialize") == 0)
 			opencl_force_serialization = true;
@@ -4394,13 +4419,13 @@
 #if defined(DEBUG) || defined(_DEBUG)
 	printf("DEBUG build\n");
 #endif
-
+		
 	if (argc == 1)
 	{
 		print_usage();
 		return EXIT_FAILURE;
 	}
-
+		
 	command_line_params opts;
 	if (!opts.parse(argc, argv))
 	{
@@ -4413,7 +4438,7 @@
 #else
 	printf("Multithreading: %u, Zstandard support: %u, OpenCL: %u\n", (uint32_t)opts.m_comp_params.m_multithreading, basist::basisu_transcoder_supports_ktx2_zstd(), opencl_is_available());
 #endif
-
+		
 	if (!opts.process_listing_files())
 		return EXIT_FAILURE;
 
@@ -4459,6 +4484,9 @@
 	case cTest:
 		status = test_mode(opts);
 		break;
+	case cCLBench:
+		status = clbench_mode(opts);
+		break;
 	case cSplitImage:
 		status = split_image_mode(opts);
 		break;
@@ -4475,6 +4503,9 @@
 
 int main(int argc, const char** argv)
 {
+#ifdef _WIN32
+	SetConsoleOutputCP(CP_UTF8);
+#endif
 #ifdef _DEBUG
 	printf("DEBUG\n");
 #endif
diff --git a/encoder/basisu_comp.cpp b/encoder/basisu_comp.cpp
index 166a1c4..41eae2b 100644
--- a/encoder/basisu_comp.cpp
+++ b/encoder/basisu_comp.cpp
@@ -1501,7 +1501,8 @@
 
 				if (m_params.m_compute_stats)
 				{
-					printf("Slice: %u\n", slice_index);
+					if (m_params.m_print_stats)
+						printf("Slice: %u\n", slice_index);
 
 					image_stats& s = m_stats[slice_index];
 
@@ -1511,81 +1512,100 @@
 
 					// ---- .basis stats
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3);
-					em.print(".basis RGB Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis RGB Avg:          ");
 					s.m_basis_rgb_avg_psnr = em.m_psnr;
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4);
-					em.print(".basis RGBA Avg:         ");
+					if (m_params.m_print_stats)
+						em.print(".basis RGBA Avg:         ");
 					s.m_basis_rgba_avg_psnr = em.m_psnr;
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1);
-					em.print(".basis R   Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis R   Avg:          ");
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1);
-					em.print(".basis G   Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis G   Avg:          ");
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1);
-					em.print(".basis B   Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis B   Avg:          ");
 
 					if (m_params.m_uastc)
 					{
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1);
-						em.print(".basis A   Avg:          ");
+						if (m_params.m_print_stats)
+							em.print(".basis A   Avg:          ");
 
 						s.m_basis_a_avg_psnr = em.m_psnr;
 					}
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0);
-					em.print(".basis 709 Luma:         ");
+					if (m_params.m_print_stats)
+						em.print(".basis 709 Luma:         ");
 					s.m_basis_luma_709_psnr = static_cast<float>(em.m_psnr);
 					s.m_basis_luma_709_ssim = static_cast<float>(em.m_ssim);
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true);
-					em.print(".basis 601 Luma:         ");
+					if (m_params.m_print_stats)
+						em.print(".basis 601 Luma:         ");
 					s.m_basis_luma_601_psnr = static_cast<float>(em.m_psnr);
 
 					if (m_slice_descs.size() == 1)
 					{
 						const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size();
-						debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
-						debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+						if (m_params.m_print_stats)
+						{
+							debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+							debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+						}
 					}
 
 					if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width())
 					{
 						// ---- BC7 stats
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3);
-						em.print("BC7 RGB Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 RGB Avg:             ");
 						s.m_bc7_rgb_avg_psnr = em.m_psnr;
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4);
-						em.print("BC7 RGBA Avg:            ");
+						if (m_params.m_print_stats)
+							em.print("BC7 RGBA Avg:            ");
 						s.m_bc7_rgba_avg_psnr = em.m_psnr;
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1);
-						em.print("BC7 R   Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 R   Avg:             ");
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1);
-						em.print("BC7 G   Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 G   Avg:             ");
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1);
-						em.print("BC7 B   Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 B   Avg:             ");
 
 						if (m_params.m_uastc)
 						{
 							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1);
-							em.print("BC7 A   Avg:             ");
+							if (m_params.m_print_stats)
+								em.print("BC7 A   Avg:             ");
 
 							s.m_bc7_a_avg_psnr = em.m_psnr;
 						}
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0);
-						em.print("BC7 709 Luma:            ");
+						if (m_params.m_print_stats)
+							em.print("BC7 709 Luma:            ");
 						s.m_bc7_luma_709_psnr = static_cast<float>(em.m_psnr);
 						s.m_bc7_luma_709_ssim = static_cast<float>(em.m_ssim);
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true);
-						em.print("BC7 601 Luma:            ");
+						if (m_params.m_print_stats)
+							em.print("BC7 601 Luma:            ");
 						s.m_bc7_luma_601_psnr = static_cast<float>(em.m_psnr);
 					}
 
@@ -1593,16 +1613,19 @@
 					{
 						// ---- Nearly best possible ETC1S stats
 						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3);
-						em.print("Unquantized ETC1S RGB Avg:     ");
+						if (m_params.m_print_stats)
+							em.print("Unquantized ETC1S RGB Avg:     ");
 						s.m_best_etc1s_rgb_avg_psnr = static_cast<float>(em.m_psnr);
 
 						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0);
-						em.print("Unquantized ETC1S 709 Luma:    ");
+						if (m_params.m_print_stats)
+							em.print("Unquantized ETC1S 709 Luma:    ");
 						s.m_best_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr);
 						s.m_best_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim);
 
 						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true);
-						em.print("Unquantized ETC1S 601 Luma:    ");
+						if (m_params.m_print_stats)
+							em.print("Unquantized ETC1S 601 Luma:    ");
 						s.m_best_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr);
 					}
 				}
@@ -2311,6 +2334,8 @@
 		}
 
 		comp_params.m_compute_stats = (pStats != nullptr);
+		comp_params.m_print_stats = (flags_and_quality & cFlagPrintStats) != 0;
+		comp_params.m_status_output = (flags_and_quality & cFlagPrintStatus) != 0;
 
 		// Create the compressor, initialize it, and process the input
 		basis_compressor comp;
@@ -2328,6 +2353,11 @@
 			return nullptr;
 		}
 
+		if ((pStats) && (comp.get_opencl_failed()))
+		{
+			pStats->m_opencl_failed = true;
+		}
+
 		// Get the output file data and return it to the caller
 		void* pFile_data = nullptr;
 		const uint8_vec* pFile_data_vec = comp_params.m_create_ktx2_file ? &comp.get_output_ktx2_file() : &comp.get_output_basis_file();
@@ -2388,4 +2418,108 @@
 		free(p);
 	}
 
+	bool basis_benchmark_etc1s_opencl(bool* pOpenCL_failed)
+	{ // Times basis_compress() on one synthetic image without and with cFlagUseOpenCL; returns true only if the best OpenCL time is lower. Any failure returns false.
+		if (pOpenCL_failed)
+			*pOpenCL_failed = false;
+		// Nothing to benchmark unless OpenCL is available.
+		if (!opencl_is_available())
+		{
+			error_printf("basis_benchmark_etc1s_opencl: OpenCL support must be enabled first!\n");
+			return false;
+		}
+		// Build a single W x H test image and fill it with random characters.
+		const uint32_t W = 1024, H = 1024;
+		basisu::vector<image> images;
+		image& img = images.enlarge(1)->resize(W, H);
+		
+		const uint32_t NUM_RAND_LETTERS = 6000;// 40000;
+		// Fixed seed; presumably this makes the generated image identical on every run (assumes rand is deterministic for a given seed).
+		rand r;
+		r.seed(200);
+
+		for (uint32_t i = 0; i < NUM_RAND_LETTERS; i++)
+		{
+			uint32_t x = r.irand(0, W - 1), y = r.irand(0, H - 1);
+			uint32_t sx = r.irand(1, 4), sy = r.irand(1, 4); // presumably per-axis scale factors for debug_text() - confirm against image::debug_text
+			color_rgba c(r.byte(), r.byte(), r.byte(), 255);
+
+			img.debug_text(x, y, sx, sy, c, nullptr, false, "%c", static_cast<char>(r.irand(32, 127)));
+		}
+
+		//save_png("test.png", img);
+
+		image_stats stats;
+
+		uint32_t flags_and_quality = cFlagSRGB | cFlagThreaded | 255;
+		size_t comp_size = 0;
+
+		double best_cpu_time = 1e+9f, best_gpu_time = 1e+9f; // large sentinels, lowered by minimum() after each timed run
+
+		const uint32_t TIMES_TO_ENCODE = 2;
+		interval_timer tm;
+		// CPU pass: flags without cFlagUseOpenCL; keep the fastest of TIMES_TO_ENCODE runs.
+		for (uint32_t i = 0; i < TIMES_TO_ENCODE; i++)
+		{
+			tm.start();
+			void* pComp_data = basis_compress(
+				images,
+				flags_and_quality, 1.0f,
+				&comp_size,
+				&stats);
+			double cpu_time = tm.get_elapsed_secs();
+			if (!pComp_data)
+			{
+				error_printf("basis_benchmark_etc1s_opencl: basis_compress() failed (CPU)!\n");
+				return false;
+			}
+			
+			best_cpu_time = minimum(best_cpu_time, cpu_time);
+
+			basis_free_data(pComp_data);
+		}
+
+		printf("Best CPU time: %3.3f\n", best_cpu_time);
+		// OpenCL pass: same input and flags, plus cFlagUseOpenCL.
+		for (uint32_t i = 0; i < TIMES_TO_ENCODE; i++)
+		{
+			tm.start();
+			void* pComp_data = basis_compress(
+				images,
+				flags_and_quality | cFlagUseOpenCL, 1.0f,
+				&comp_size,
+				&stats);
+			// An OpenCL failure reported through stats aborts the benchmark, even if compressed data was still returned.
+			if (stats.m_opencl_failed)
+			{
+				error_printf("basis_benchmark_etc1s_opencl: OpenCL failed!\n");
+
+				basis_free_data(pComp_data);
+
+				if (pOpenCL_failed)
+					*pOpenCL_failed = true;
+
+				return false;
+			}
+			// Note: unlike the CPU pass, the elapsed time is sampled after the failure check rather than right after basis_compress() returns.
+			double gpu_time = tm.get_elapsed_secs();
+			if (!pComp_data)
+			{
+				error_printf("basis_benchmark_etc1s_opencl: basis_compress() failed (GPU)!\n");
+				return false; // *pOpenCL_failed stays false on this path
+			}
+
+			best_gpu_time = minimum(best_gpu_time, gpu_time);
+
+			basis_free_data(pComp_data);
+		}
+
+		printf("Best GPU time: %3.3f\n", best_gpu_time);
+				// Strict comparison: a tie counts as OpenCL not being worth using.
+		return best_gpu_time < best_cpu_time;
+	}
+
 } // namespace basisu
+
+
+
diff --git a/encoder/basisu_comp.h b/encoder/basisu_comp.h
index aa5ea6f..b6c9fef 100644
--- a/encoder/basisu_comp.h
+++ b/encoder/basisu_comp.h
@@ -92,6 +92,8 @@
 			m_best_etc1s_luma_709_psnr = 0.0f;
 			m_best_etc1s_luma_601_psnr = 0.0f;
 			m_best_etc1s_luma_709_ssim = 0.0f;
+
+			m_opencl_failed = false;
 		}
 
 		std::string m_filename;
@@ -119,6 +121,8 @@
 		float m_best_etc1s_luma_709_psnr;
 		float m_best_etc1s_luma_601_psnr;
 		float m_best_etc1s_luma_709_ssim;
+
+		bool m_opencl_failed;
 	};
 
 	template<bool def>
@@ -255,6 +259,7 @@
 			m_write_output_basis_files.clear();
 			m_compression_level.clear();
 			m_compute_stats.clear();
+			m_print_stats.clear();
 			m_check_for_alpha.clear();
 			m_force_alpha.clear();
 			m_multithreading.clear();
@@ -373,6 +378,9 @@
 								
 		// Compute and display image metrics 
 		bool_param<false> m_compute_stats;
+
+		// Print stats to stdout, if m_compute_stats is true.
+		bool_param<true> m_print_stats;
 		
 		// Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels
 		bool_param<true> m_check_for_alpha;
@@ -583,11 +591,16 @@
 		cFlagYFlip = 1 << 16,		// flip source image on Y axis before compression
 		
 		cFlagUASTC = 1 << 17,		// use UASTC compression vs. ETC1S
-		cFlagUASTCRDO = 1 << 18		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
+		cFlagUASTCRDO = 1 << 18,		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
+		
+		cFlagPrintStats = 1 << 19,	// print image stats to stdout
+		cFlagPrintStatus = 1 << 20	// print status to stdout
 	};
 
 	// This function accepts an array of source images. 
 	// If more than one image is provided, it's assumed the images form a mipmap pyramid and automatic mipmap generation is disabled.
+	// Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. The returned block must be freed using basis_free_data().
+	// basisu_encoder_init() MUST be called first!
 	void* basis_compress(
 		const basisu::vector<image> &source_images,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
@@ -604,6 +617,12 @@
 	// Frees the dynamically allocated file data returned by basis_compress().
 	void basis_free_data(void* p);
 
+	// Runs a short benchmark using synthetic image data to time OpenCL encoding vs. CPU encoding, with multithreading enabled.
+	// Returns true if OpenCL is worth using on this system, otherwise false.
+	// If pOpenCL_failed is not null, it will be set to true if OpenCL encoding failed *on this particular machine/driver/BasisU version* and the encoder fell back to CPU encoding.
+	// basisu_encoder_init() MUST be called first. If OpenCL support wasn't enabled this always returns false.
+	bool basis_benchmark_etc1s_opencl(bool *pOpenCL_failed = nullptr);
+
 	// Parallel compression API
 	struct parallel_results
 	{
diff --git a/encoder/basisu_enc.cpp b/encoder/basisu_enc.cpp
index 7e21d36..c431cea 100644
--- a/encoder/basisu_enc.cpp
+++ b/encoder/basisu_enc.cpp
@@ -229,7 +229,7 @@
 	{
 		QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(pTicks));
 	}
-#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__EMSCRIPTEN__)
 #include <sys/time.h>
 	inline void query_counter(timer_ticks* pTicks)
 	{
diff --git a/encoder/basisu_kernels_sse.cpp b/encoder/basisu_kernels_sse.cpp
index 4f15a5a..c368605 100644
--- a/encoder/basisu_kernels_sse.cpp
+++ b/encoder/basisu_kernels_sse.cpp
@@ -22,22 +22,6 @@
 #include <intrin.h>
 #endif
 
-#if !defined(_MSC_VER)
-	#if __AVX__ || __AVX2__ || __AVX512F__
-		#error Please check your compiler options
-	#endif
-	
-	#if CPPSPMD_SSE2
-		#if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__
-			#error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file
-		#endif
-	#else
-		#if !__SSE4_1__ || !__SSE3__ || !__SSSE3__
-			#error Please check your compiler options
-		#endif
-	#endif
-#endif
-
 #include "cppspmd_sse.h"
 
 #include "cppspmd_type_aliases.h"
diff --git a/transcoder/basisu_transcoder_internal.h b/transcoder/basisu_transcoder_internal.h
index 776a998..0505df6 100644
--- a/transcoder/basisu_transcoder_internal.h
+++ b/transcoder/basisu_transcoder_internal.h
@@ -162,7 +162,7 @@
 				next_code[i + 1] = (total = ((total + syms_using_codesize[i]) << 1));
 			}
 
-			if (((1U << basisu::cHuffmanMaxSupportedInternalCodeSize) != total) && (used_syms > 1U))
+			if (((1U << basisu::cHuffmanMaxSupportedInternalCodeSize) != total) && (used_syms != 1U))
 				return false;
 
 			for (int tree_next = -1, sym_index = 0; sym_index < (int)total_syms; ++sym_index)