Adding -clbench command line option, and a new API basis_benchmark_etc1s_opencl() to determine if OpenCL encoding is worthwhile on the current machine/driver/GPU.
diff --git a/basisu_tool.cpp b/basisu_tool.cpp
index 08f84e1..9b2a43a 100644
--- a/basisu_tool.cpp
+++ b/basisu_tool.cpp
@@ -54,6 +54,7 @@
 	cBench,
 	cCompSize,
 	cTest,
+	cCLBench,
 	cSplitImage,
 	cCombineImages
 };
@@ -365,6 +366,8 @@
 				m_mode = cCompSize;
 			else if (strcasecmp(pArg, "-test") == 0)
 				m_mode = cTest;
+			else if (strcasecmp(pArg, "-clbench") == 0)
+				m_mode = cCLBench;
 			else if (strcasecmp(pArg, "-test_dir") == 0)
 			{
 				REMAINING_ARGS_CHECK(1);
@@ -4266,7 +4269,7 @@
 		size_t data_size = 0;
 
 		// Test ETC1S
-		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0);
+		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagPrintStats | cFlagPrintStatus;
 
 		void* pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats);
 		if (!pData)
@@ -4293,7 +4296,7 @@
 
 		if (opencl_is_available())
 		{
-			flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUseOpenCL;
+			flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUseOpenCL | cFlagPrintStats | cFlagPrintStatus;
 
 			pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats);
 			if (!pData)
@@ -4329,7 +4332,7 @@
 		}
 
 		// Test UASTC
-		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUASTC;
+		flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUASTC | cFlagPrintStats | cFlagPrintStatus;
 
 		pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats);
 		if (!pData)
@@ -4362,6 +4365,24 @@
 	return result;
 }
 
+static bool clbench_mode(command_line_params& opts) // Handler for the cCLBench tool mode (-clbench): runs the ETC1S CPU vs. OpenCL benchmark and prints the verdict.
+{
+	BASISU_NOTE_UNUSED(opts);
+	// opencl_failed is filled in by the benchmark; it is only ever set to true on a path where the benchmark returns false.
+	bool opencl_failed = false;
+	bool use_cl = basis_benchmark_etc1s_opencl(&opencl_failed);
+	if (use_cl)
+		printf("OpenCL ETC1S encoding is faster on this machine\n");
+	else
+	{
+		if (opencl_failed)
+			printf("OpenCL failed!\n");
+		printf("CPU ETC1S encoding is faster on this machine\n"); // NOTE: also printed when the benchmark returned false without comparing times (OpenCL unavailable, or a basis_compress() failure).
+	}
+	// Always reports success to the caller, whatever the benchmark outcome.
+	return true;
+}
+
 static int main_internal(int argc, const char **argv)
 {
 	printf("Basis Universal GPU Texture Compressor v" BASISU_TOOL_VERSION "\nCopyright (C) 2019-2022 Binomial LLC, All rights reserved\n");
@@ -4374,7 +4395,7 @@
 	bool opencl_force_serialization = false;
 	for (int i = 1; i < argc; i++)
 	{
-		if (strcmp(argv[i], "-opencl") == 0)
+		if ((strcmp(argv[i], "-opencl") == 0) || (strcmp(argv[i], "-clbench") == 0))
 			use_opencl = true;
 		if (strcmp(argv[i], "-opencl_serialize") == 0)
 			opencl_force_serialization = true;
@@ -4394,13 +4415,13 @@
 #if defined(DEBUG) || defined(_DEBUG)
 	printf("DEBUG build\n");
 #endif
-
+		
 	if (argc == 1)
 	{
 		print_usage();
 		return EXIT_FAILURE;
 	}
-
+		
 	command_line_params opts;
 	if (!opts.parse(argc, argv))
 	{
@@ -4413,7 +4434,7 @@
 #else
 	printf("Multithreading: %u, Zstandard support: %u, OpenCL: %u\n", (uint32_t)opts.m_comp_params.m_multithreading, basist::basisu_transcoder_supports_ktx2_zstd(), opencl_is_available());
 #endif
-
+		
 	if (!opts.process_listing_files())
 		return EXIT_FAILURE;
 
@@ -4459,6 +4480,9 @@
 	case cTest:
 		status = test_mode(opts);
 		break;
+	case cCLBench:
+		status = clbench_mode(opts);
+		break;
 	case cSplitImage:
 		status = split_image_mode(opts);
 		break;
diff --git a/encoder/basisu_comp.cpp b/encoder/basisu_comp.cpp
index 166a1c4..41eae2b 100644
--- a/encoder/basisu_comp.cpp
+++ b/encoder/basisu_comp.cpp
@@ -1501,7 +1501,8 @@
 
 				if (m_params.m_compute_stats)
 				{
-					printf("Slice: %u\n", slice_index);
+					if (m_params.m_print_stats)
+						printf("Slice: %u\n", slice_index);
 
 					image_stats& s = m_stats[slice_index];
 
@@ -1511,81 +1512,100 @@
 
 					// ---- .basis stats
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3);
-					em.print(".basis RGB Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis RGB Avg:          ");
 					s.m_basis_rgb_avg_psnr = em.m_psnr;
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4);
-					em.print(".basis RGBA Avg:         ");
+					if (m_params.m_print_stats)
+						em.print(".basis RGBA Avg:         ");
 					s.m_basis_rgba_avg_psnr = em.m_psnr;
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1);
-					em.print(".basis R   Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis R   Avg:          ");
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1);
-					em.print(".basis G   Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis G   Avg:          ");
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1);
-					em.print(".basis B   Avg:          ");
+					if (m_params.m_print_stats)
+						em.print(".basis B   Avg:          ");
 
 					if (m_params.m_uastc)
 					{
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1);
-						em.print(".basis A   Avg:          ");
+						if (m_params.m_print_stats)
+							em.print(".basis A   Avg:          ");
 
 						s.m_basis_a_avg_psnr = em.m_psnr;
 					}
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0);
-					em.print(".basis 709 Luma:         ");
+					if (m_params.m_print_stats)
+						em.print(".basis 709 Luma:         ");
 					s.m_basis_luma_709_psnr = static_cast<float>(em.m_psnr);
 					s.m_basis_luma_709_ssim = static_cast<float>(em.m_ssim);
 
 					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true);
-					em.print(".basis 601 Luma:         ");
+					if (m_params.m_print_stats)
+						em.print(".basis 601 Luma:         ");
 					s.m_basis_luma_601_psnr = static_cast<float>(em.m_psnr);
 
 					if (m_slice_descs.size() == 1)
 					{
 						const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size();
-						debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
-						debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+						if (m_params.m_print_stats)
+						{
+							debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+							debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+						}
 					}
 
 					if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width())
 					{
 						// ---- BC7 stats
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3);
-						em.print("BC7 RGB Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 RGB Avg:             ");
 						s.m_bc7_rgb_avg_psnr = em.m_psnr;
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4);
-						em.print("BC7 RGBA Avg:            ");
+						if (m_params.m_print_stats)
+							em.print("BC7 RGBA Avg:            ");
 						s.m_bc7_rgba_avg_psnr = em.m_psnr;
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1);
-						em.print("BC7 R   Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 R   Avg:             ");
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1);
-						em.print("BC7 G   Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 G   Avg:             ");
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1);
-						em.print("BC7 B   Avg:             ");
+						if (m_params.m_print_stats)
+							em.print("BC7 B   Avg:             ");
 
 						if (m_params.m_uastc)
 						{
 							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1);
-							em.print("BC7 A   Avg:             ");
+							if (m_params.m_print_stats)
+								em.print("BC7 A   Avg:             ");
 
 							s.m_bc7_a_avg_psnr = em.m_psnr;
 						}
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0);
-						em.print("BC7 709 Luma:            ");
+						if (m_params.m_print_stats)
+							em.print("BC7 709 Luma:            ");
 						s.m_bc7_luma_709_psnr = static_cast<float>(em.m_psnr);
 						s.m_bc7_luma_709_ssim = static_cast<float>(em.m_ssim);
 
 						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true);
-						em.print("BC7 601 Luma:            ");
+						if (m_params.m_print_stats)
+							em.print("BC7 601 Luma:            ");
 						s.m_bc7_luma_601_psnr = static_cast<float>(em.m_psnr);
 					}
 
@@ -1593,16 +1613,19 @@
 					{
 						// ---- Nearly best possible ETC1S stats
 						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3);
-						em.print("Unquantized ETC1S RGB Avg:     ");
+						if (m_params.m_print_stats)
+							em.print("Unquantized ETC1S RGB Avg:     ");
 						s.m_best_etc1s_rgb_avg_psnr = static_cast<float>(em.m_psnr);
 
 						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0);
-						em.print("Unquantized ETC1S 709 Luma:    ");
+						if (m_params.m_print_stats)
+							em.print("Unquantized ETC1S 709 Luma:    ");
 						s.m_best_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr);
 						s.m_best_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim);
 
 						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true);
-						em.print("Unquantized ETC1S 601 Luma:    ");
+						if (m_params.m_print_stats)
+							em.print("Unquantized ETC1S 601 Luma:    ");
 						s.m_best_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr);
 					}
 				}
@@ -2311,6 +2334,8 @@
 		}
 
 		comp_params.m_compute_stats = (pStats != nullptr);
+		comp_params.m_print_stats = (flags_and_quality & cFlagPrintStats) != 0;
+		comp_params.m_status_output = (flags_and_quality & cFlagPrintStatus) != 0;
 
 		// Create the compressor, initialize it, and process the input
 		basis_compressor comp;
@@ -2328,6 +2353,11 @@
 			return nullptr;
 		}
 
+		if ((pStats) && (comp.get_opencl_failed()))
+		{
+			pStats->m_opencl_failed = true;
+		}
+
 		// Get the output file data and return it to the caller
 		void* pFile_data = nullptr;
 		const uint8_vec* pFile_data_vec = comp_params.m_create_ktx2_file ? &comp.get_output_ktx2_file() : &comp.get_output_basis_file();
@@ -2388,4 +2418,108 @@
 		free(p);
 	}
 
+	bool basis_benchmark_etc1s_opencl(bool* pOpenCL_failed) // Times ETC1S encoding of a synthetic image with and without cFlagUseOpenCL; returns true only if the best OpenCL time is lower.
+	{
+		if (pOpenCL_failed)
+			*pOpenCL_failed = false; // default result; only set to true if an OpenCL-flagged encode reports failure below
+
+		if (!opencl_is_available())
+		{
+			error_printf("basis_benchmark_etc1s_opencl: OpenCL support must be enabled first!\n");
+			return false;
+		}
+		// Build a single W x H synthetic test image.
+		const uint32_t W = 1024, H = 1024;
+		basisu::vector<image> images;
+		image& img = images.enlarge(1)->resize(W, H);
+		
+		const uint32_t NUM_RAND_LETTERS = 6000;// 40000;
+		// Fixed seed -- presumably so every run benchmarks the same image (TODO confirm rand is deterministic for a given seed).
+		rand r;
+		r.seed(200);
+		// Stamp NUM_RAND_LETTERS random characters over the image with debug_text().
+		for (uint32_t i = 0; i < NUM_RAND_LETTERS; i++)
+		{
+			uint32_t x = r.irand(0, W - 1), y = r.irand(0, H - 1); // random x/y (irand bounds are 0..W-1 and 0..H-1)
+			uint32_t sx = r.irand(1, 4), sy = r.irand(1, 4); // presumably per-axis text scale -- confirm against debug_text()
+			color_rgba c(r.byte(), r.byte(), r.byte(), 255); // random RGB, alpha fixed at 255
+
+			img.debug_text(x, y, sx, sy, c, nullptr, false, "%c", static_cast<char>(r.irand(32, 127))); // one character code drawn from irand(32, 127)
+		}
+
+		//save_png("test.png", img);
+
+		image_stats stats; // basis_compress() sets stats.m_opencl_failed when the compressor reports an OpenCL failure
+
+		uint32_t flags_and_quality = cFlagSRGB | cFlagThreaded | 255; // low bits = 255: presumably the maximum quality level -- confirm against the flags enum docs
+		size_t comp_size = 0;
+
+		double best_cpu_time = 1e+9f, best_gpu_time = 1e+9f; // large sentinels so the first measured time always wins minimum()
+
+		const uint32_t TIMES_TO_ENCODE = 2; // each path is encoded this many times and only the fastest run is kept
+		interval_timer tm;
+		// Pass 1: encode without cFlagUseOpenCL.
+		for (uint32_t i = 0; i < TIMES_TO_ENCODE; i++)
+		{
+			tm.start();
+			void* pComp_data = basis_compress(
+				images,
+				flags_and_quality, 1.0f, // 1.0f is the uastc_rdo_quality argument
+				&comp_size,
+				&stats);
+			double cpu_time = tm.get_elapsed_secs();
+			if (!pComp_data)
+			{
+				error_printf("basis_benchmark_etc1s_opencl: basis_compress() failed (CPU)!\n");
+				return false;
+			}
+			
+			best_cpu_time = minimum(best_cpu_time, cpu_time);
+
+			basis_free_data(pComp_data);
+		}
+
+		printf("Best CPU time: %3.3f\n", best_cpu_time);
+		// Pass 2: the same encode with cFlagUseOpenCL added.
+		for (uint32_t i = 0; i < TIMES_TO_ENCODE; i++)
+		{
+			tm.start();
+			void* pComp_data = basis_compress(
+				images,
+				flags_and_quality | cFlagUseOpenCL, 1.0f,
+				&comp_size,
+				&stats);
+			// An OpenCL failure aborts the benchmark: the caller's flag is set and false is returned.
+			if (stats.m_opencl_failed)
+			{
+				error_printf("basis_benchmark_etc1s_opencl: OpenCL failed!\n");
+
+				basis_free_data(pComp_data); // release any output produced despite the failure (pComp_data has not been null-checked yet)
+
+				if (pOpenCL_failed)
+					*pOpenCL_failed = true;
+
+				return false;
+			}
+
+			double gpu_time = tm.get_elapsed_secs(); // NOTE: read after the failure check above, whereas the CPU loop reads it immediately after basis_compress()
+			if (!pComp_data)
+			{
+				error_printf("basis_benchmark_etc1s_opencl: basis_compress() failed (GPU)!\n");
+				return false;
+			}
+
+			best_gpu_time = minimum(best_gpu_time, gpu_time);
+
+			basis_free_data(pComp_data);
+		}
+
+		printf("Best GPU time: %3.3f\n", best_gpu_time);
+				
+		return best_gpu_time < best_cpu_time; // strict '<': a tie counts as "not worth using OpenCL"
+	}
+
 } // namespace basisu
+
+
+
diff --git a/encoder/basisu_comp.h b/encoder/basisu_comp.h
index aa5ea6f..b6c9fef 100644
--- a/encoder/basisu_comp.h
+++ b/encoder/basisu_comp.h
@@ -92,6 +92,8 @@
 			m_best_etc1s_luma_709_psnr = 0.0f;
 			m_best_etc1s_luma_601_psnr = 0.0f;
 			m_best_etc1s_luma_709_ssim = 0.0f;
+
+			m_opencl_failed = false;
 		}
 
 		std::string m_filename;
@@ -119,6 +121,8 @@
 		float m_best_etc1s_luma_709_psnr;
 		float m_best_etc1s_luma_601_psnr;
 		float m_best_etc1s_luma_709_ssim;
+
+		bool m_opencl_failed;
 	};
 
 	template<bool def>
@@ -255,6 +259,7 @@
 			m_write_output_basis_files.clear();
 			m_compression_level.clear();
 			m_compute_stats.clear();
+			m_print_stats.clear();
 			m_check_for_alpha.clear();
 			m_force_alpha.clear();
 			m_multithreading.clear();
@@ -373,6 +378,9 @@
 								
 		// Compute and display image metrics 
 		bool_param<false> m_compute_stats;
+
+		// Print stats to stdout, if m_compute_stats is true.
+		bool_param<true> m_print_stats;
 		
 		// Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels
 		bool_param<true> m_check_for_alpha;
@@ -583,11 +591,16 @@
 		cFlagYFlip = 1 << 16,		// flip source image on Y axis before compression
 		
 		cFlagUASTC = 1 << 17,		// use UASTC compression vs. ETC1S
-		cFlagUASTCRDO = 1 << 18		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
+		cFlagUASTCRDO = 1 << 18,		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
+		
+		cFlagPrintStats = 1 << 19,	// print image stats to stdout
+		cFlagPrintStatus = 1 << 20	// print status to stdout
 	};
 
 	// This function accepts an array of source images. 
 	// If more than one image is provided, it's assumed the images form a mipmap pyramid and automatic mipmap generation is disabled.
+	// Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. The returned block must be freed using basis_free_data().
+	// basisu_encoder_init() MUST be called first!
 	void* basis_compress(
 		const basisu::vector<image> &source_images,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
@@ -604,6 +617,12 @@
 	// Frees the dynamically allocated file data returned by basis_compress().
 	void basis_free_data(void* p);
 
+	// Runs a short benchmark using synthetic image data to time OpenCL encoding vs. CPU encoding, with multithreading enabled.
+	// Returns true if OpenCL is worth using on this system, otherwise false.
+	// If pOpenCL_failed is not null, it will be set to true if OpenCL encoding failed *on this particular machine/driver/BasisU version* and the encoder fell back to CPU encoding.
+	// basisu_encoder_init() MUST be called first. If OpenCL support wasn't enabled this always returns false.
+	bool basis_benchmark_etc1s_opencl(bool *pOpenCL_failed = nullptr);
+
 	// Parallel compression API
 	struct parallel_results
 	{