Merge pull request #220 from JCash/basisu-min-max

Replaced std::min/std::max with basisu::minimum/basisu::maximum
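
The PR title describes swapping std::min/std::max for basisu's own helpers. A hedged sketch of what that substitution typically looks like; the helper definitions below are assumptions (simple branchy templates rather than anything from <algorithm>), and the call site is illustrative rather than taken from the patch:

    #include <cstdint>

    namespace basisu
    {
        // Assumed definitions of the replacement helpers.
        template <typename T> inline T minimum(T a, T b) { return (a < b) ? a : b; }
        template <typename T> inline T maximum(T a, T b) { return (a > b) ? a : b; }
    }

    // Illustrative call site, not from the patch: clamp v into [lo, hi].
    inline uint32_t clamp_to_range(uint32_t v, uint32_t lo, uint32_t hi)
    {
        return basisu::maximum(lo, basisu::minimum(v, hi));   // was std::max(lo, std::min(v, hi))
    }

One common motivation for such a swap is avoiding collisions with the min/max macros that windows.h defines unless NOMINMAX is set; the PR itself doesn't state its reason, so treat that as an assumption.
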
diff --git a/encoder/basisu_frontend.cpp b/encoder/basisu_frontend.cpp
index 0039db0..f7d47e9 100644
--- a/encoder/basisu_frontend.cpp
+++ b/encoder/basisu_frontend.cpp
@@ -295,7 +295,7 @@
 		optimize_selector_codebook();
 
 		if (m_params.m_debug_stats)
-			debug_printf("Total selector clusters: %u\n", (uint32_t)m_selector_cluster_indices.size());
+			debug_printf("Total selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size());
 
 		finalize();
 
@@ -318,7 +318,7 @@
 			return;
 
 		uint32_t total_blocks_relocated = 0;
-		const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_indices.size();
+		const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();
 
 		bool_vec block_relocated_flags(m_total_blocks);
 
@@ -344,7 +344,7 @@
 
 			m_optimized_cluster_selectors.push_back(blk);
 			
-			vector_ensure_element_is_valid(m_selector_cluster_indices, new_selector_cluster_index);
+			vector_ensure_element_is_valid(m_selector_cluster_block_indices, new_selector_cluster_index);
 			
 			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
 			{
@@ -373,14 +373,14 @@
 				// Change the block to use the new cluster
 				m_block_selector_cluster_index[block_index] = new_selector_cluster_index;
 				
-				m_selector_cluster_indices[new_selector_cluster_index].push_back(block_index);
+				m_selector_cluster_block_indices[new_selector_cluster_index].push_back(block_index);
 
 				block_relocated_flags[block_index] = true;
 
 #if 0
-				int j = vector_find(m_selector_cluster_indices[old_selector_cluster_index], block_index);
+				int j = vector_find(m_selector_cluster_block_indices[old_selector_cluster_index], block_index);
 				if (j >= 0)
-					m_selector_cluster_indices[old_selector_cluster_index].erase(m_selector_cluster_indices[old_selector_cluster_index].begin() + j);
+					m_selector_cluster_block_indices[old_selector_cluster_index].erase(m_selector_cluster_block_indices[old_selector_cluster_index].begin() + j);
 #endif
 
 				total_blocks_relocated++;
@@ -397,7 +397,7 @@
 
 			for (int selector_cluster_index = 0; selector_cluster_index < (int)initial_selector_clusters; selector_cluster_index++)
 			{
-				uint_vec& block_indices = m_selector_cluster_indices[selector_cluster_index];
+				uint_vec& block_indices = m_selector_cluster_block_indices[selector_cluster_index];
 
 				uint32_t dst_ofs = 0;
 
@@ -415,6 +415,7 @@
 		debug_printf("Total blocks relocated to new flat selector clusters: %u\n", total_blocks_relocated);
 	}
 
+	// This method will change the number and ordering of the selector codebook clusters.
 	void basisu_frontend::optimize_selector_codebook()
 	{
 		debug_printf("optimize_selector_codebook\n");
@@ -452,6 +453,8 @@
 			new_to_old.push_back(i);
 		}
 
+		debug_printf("Original selector clusters: %u, new cluster selectors: %u\n", orig_total_selector_clusters, total_new_entries);
+
 		for (uint32_t i = 0; i < m_block_selector_cluster_index.size(); i++)
 		{
 			BASISU_FRONTEND_VERIFY((old_to_new[m_block_selector_cluster_index[i]] >= 0) && (old_to_new[m_block_selector_cluster_index[i]] < (int)total_new_entries));
@@ -460,7 +463,7 @@
 
 		basisu::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? total_new_entries : 0);
 		basist::etc1_global_selector_codebook_entry_id_vec new_optimized_cluster_selector_global_cb_ids(m_optimized_cluster_selector_global_cb_ids.size() ? total_new_entries : 0);
-		basisu::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_indices.size() ? total_new_entries : 0);
+		basisu::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_block_indices.size() ? total_new_entries : 0);
 		bool_vec new_selector_cluster_uses_global_cb(m_selector_cluster_uses_global_cb.size() ? total_new_entries : 0);
 
 		for (uint32_t i = 0; i < total_new_entries; i++)
@@ -471,18 +474,26 @@
 			if (m_optimized_cluster_selector_global_cb_ids.size())
 				new_optimized_cluster_selector_global_cb_ids[i] = m_optimized_cluster_selector_global_cb_ids[new_to_old[i]];
 
-			if (m_selector_cluster_indices.size())
-				new_selector_cluster_indices[i] = m_selector_cluster_indices[new_to_old[i]];
+			if (m_selector_cluster_block_indices.size())
+				new_selector_cluster_indices[i] = m_selector_cluster_block_indices[new_to_old[i]];
 
 			if (m_selector_cluster_uses_global_cb.size())
 				new_selector_cluster_uses_global_cb[i] = m_selector_cluster_uses_global_cb[new_to_old[i]];
 		}
-
+				
 		m_optimized_cluster_selectors.swap(new_optimized_cluster_selectors);
 		m_optimized_cluster_selector_global_cb_ids.swap(new_optimized_cluster_selector_global_cb_ids);
-		m_selector_cluster_indices.swap(new_selector_cluster_indices);
+		m_selector_cluster_block_indices.swap(new_selector_cluster_indices);
 		m_selector_cluster_uses_global_cb.swap(new_selector_cluster_uses_global_cb);
-				
+
+		// This isn't strictly necessary - doing it for completeness/future sanity.
+		if (m_selector_clusters_within_each_parent_cluster.size())
+		{
+			for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)
+				for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++)
+					m_selector_clusters_within_each_parent_cluster[i][j] = old_to_new[m_selector_clusters_within_each_parent_cluster[i][j]];
+		}
+								
 		debug_printf("optimize_selector_codebook: Before: %u After: %u\n", orig_total_selector_clusters, total_new_entries);
 	}
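
The loop added above applies the standard codebook-compaction remap: once optimize_selector_codebook has merged and reindexed clusters, any table still holding old cluster indices has to be translated through old_to_new (the patch notes this particular table doesn't strictly need it, but it keeps every stored index consistent with the reordered codebook). A standalone toy sketch of the idea, with made-up values rather than encoder data:

    #include <cstdint>
    #include <vector>

    int main()
    {
        // Toy compaction: old clusters {0,1,2,3}, where old cluster 2 was a
        // duplicate of cluster 0 and got merged into it during optimization.
        std::vector<int> old_to_new = { 0, 1, 0, 2 };

        // A parent cluster's child list still holds old indices...
        std::vector<uint32_t> children = { 3, 2 };

        // ...so each entry is translated; children becomes { 2, 0 }.
        for (uint32_t i = 0; i < (uint32_t)children.size(); i++)
            children[i] = (uint32_t)old_to_new[children[i]];

        return 0;
    }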
 
@@ -1422,9 +1433,9 @@
 	{
 		uint_vec block_selector_cluster_indices(m_total_blocks);
 
-		for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_indices.size()); cluster_index++)
+		for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_block_indices.size()); cluster_index++)
 		{
-			const basisu::vector<uint32_t>& cluster_indices = m_selector_cluster_indices[cluster_index];
+			const basisu::vector<uint32_t>& cluster_indices = m_selector_cluster_block_indices[cluster_index];
 
 			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
 			{
@@ -1437,7 +1448,7 @@
 		} // cluster_index
 
 		m_selector_clusters_within_each_parent_cluster.resize(0);
-		m_selector_clusters_within_each_parent_cluster.resize(m_selector_parent_cluster_indices.size());
+		m_selector_clusters_within_each_parent_cluster.resize(m_selector_parent_cluster_block_indices.size());
 
 		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
 		{
@@ -1527,19 +1538,19 @@
 
 		bool status = generate_hierarchical_codebook_threaded(selector_clusterizer,
 			m_params.m_max_selector_clusters, m_use_hierarchical_selector_codebooks ? parent_codebook_size : 0,
-			m_selector_cluster_indices,
-			m_selector_parent_cluster_indices,
+			m_selector_cluster_block_indices,
+			m_selector_parent_cluster_block_indices,
 			max_threads, m_params.m_pJob_pool);
 		BASISU_FRONTEND_VERIFY(status);
 
 		if (m_use_hierarchical_selector_codebooks)
 		{
-			if (!m_selector_parent_cluster_indices.size())
+			if (!m_selector_parent_cluster_block_indices.size())
 			{
-				m_selector_parent_cluster_indices.resize(0);
-				m_selector_parent_cluster_indices.resize(1);
+				m_selector_parent_cluster_block_indices.resize(0);
+				m_selector_parent_cluster_block_indices.resize(1);
 				for (uint32_t i = 0; i < m_total_blocks; i++)
-					m_selector_parent_cluster_indices[0].push_back(i);
+					m_selector_parent_cluster_block_indices[0].push_back(i);
 			}
 
 			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 <= UINT8_MAX);
@@ -1549,9 +1560,9 @@
 			m_block_parent_selector_cluster.resize(m_total_blocks);
 			vector_set_all(m_block_parent_selector_cluster, 0xFF);
 
-			for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_selector_parent_cluster_indices.size(); parent_cluster_index++)
+			for (uint32_t parent_cluster_index = 0; parent_cluster_index < m_selector_parent_cluster_block_indices.size(); parent_cluster_index++)
 			{
-				const uint_vec &cluster = m_selector_parent_cluster_indices[parent_cluster_index];
+				const uint_vec &cluster = m_selector_parent_cluster_block_indices[parent_cluster_index];
 				for (uint32_t j = 0; j < cluster.size(); j++)
 					m_block_parent_selector_cluster[cluster[j]] = static_cast<uint8_t>(parent_cluster_index);
 			}
@@ -1561,9 +1572,9 @@
 			}
 
 			// Ensure that all the blocks within each cluster are all in the same parent cluster, or something is very wrong.
-			for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_indices.size(); cluster_index++)
+			for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_block_indices.size(); cluster_index++)
 			{
-				const uint_vec &cluster = m_selector_cluster_indices[cluster_index];
+				const uint_vec &cluster = m_selector_cluster_block_indices[cluster_index];
 			
 				uint32_t parent_cluster_index = 0;
 				for (uint32_t j = 0; j < cluster.size(); j++)
@@ -1581,14 +1592,16 @@
 			}
 		}
 
-		debug_printf("Total selector clusters: %u, total parent selector clusters: %u\n", (uint32_t)m_selector_cluster_indices.size(), (uint32_t)m_selector_parent_cluster_indices.size());
+		debug_printf("Total selector clusters: %u, total parent selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size(), (uint32_t)m_selector_parent_cluster_block_indices.size());
 	}
 
 	void basisu_frontend::create_optimized_selector_codebook(uint32_t iter)
 	{
 		debug_printf("create_optimized_selector_codebook\n");
 
-		const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_indices.size();
+		const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();
+
+		debug_printf("Total selector clusters (from m_selector_cluster_block_indices.size()): %u\n", (uint32_t)m_selector_cluster_block_indices.size());
 
 		m_optimized_cluster_selectors.resize(total_selector_clusters);
 
@@ -1610,7 +1623,7 @@
 					
 					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 					{
-						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
+						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
 
 						if (!cluster_block_indices.size())
 							continue;
@@ -1694,7 +1707,7 @@
 					
 					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
 					{
-						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[cluster_index];
+						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
 
 						if (!cluster_block_indices.size())
 							continue;
@@ -1815,22 +1828,22 @@
 #endif
 
 		} // if (m_params.m_pGlobal_sel_codebook)
-
+				
 		if (m_params.m_debug_images)
 		{
 			uint32_t max_selector_cluster_size = 0;
 
-			for (uint32_t i = 0; i < m_selector_cluster_indices.size(); i++)
-				max_selector_cluster_size = maximum<uint32_t>(max_selector_cluster_size, (uint32_t)m_selector_cluster_indices[i].size());
+			for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
+				max_selector_cluster_size = maximum<uint32_t>(max_selector_cluster_size, (uint32_t)m_selector_cluster_block_indices[i].size());
 
 			if ((max_selector_cluster_size * 5) < 32768)
 			{
 				const uint32_t x_spacer_len = 16;
-				image selector_cluster_vis(x_spacer_len + max_selector_cluster_size * 5, (uint32_t)m_selector_cluster_indices.size() * 5);
+				image selector_cluster_vis(x_spacer_len + max_selector_cluster_size * 5, (uint32_t)m_selector_cluster_block_indices.size() * 5);
 
-				for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_indices.size(); selector_cluster_index++)
+				for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_block_indices.size(); selector_cluster_index++)
 				{
-					const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_indices[selector_cluster_index];
+					const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[selector_cluster_index];
 
 					for (uint32_t y = 0; y < 4; y++)
 						for (uint32_t x = 0; x < 4; x++)
@@ -1858,22 +1871,33 @@
 	void basisu_frontend::find_optimal_selector_clusters_for_each_block()
 	{
 		debug_printf("find_optimal_selector_clusters_for_each_block\n");
-				
+
+		// Sanity checks
+		BASISU_FRONTEND_VERIFY(m_selector_cluster_block_indices.size() == m_optimized_cluster_selectors.size());
+		for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)
+		{
+			for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++)
+			{
+				BASISU_FRONTEND_VERIFY(m_selector_clusters_within_each_parent_cluster[i][j] < m_optimized_cluster_selectors.size());
+			}
+		}
+
 		m_block_selector_cluster_index.resize(m_total_blocks);
-				
+							
 		if (m_params.m_compression_level == 0)
 		{
 			// Don't do anything, just leave the blocks in their original selector clusters.
-			for (uint32_t i = 0; i < m_selector_cluster_indices.size(); i++)
+			for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
 			{
-				for (uint32_t j = 0; j < m_selector_cluster_indices[i].size(); j++)
-					m_block_selector_cluster_index[m_selector_cluster_indices[i][j]] = i;
+				for (uint32_t j = 0; j < m_selector_cluster_block_indices[i].size(); j++)
+					m_block_selector_cluster_index[m_selector_cluster_block_indices[i][j]] = i;
 			}
 		}
 		else
 		{
-			basisu::vector< basisu::vector<uint32_t> > new_cluster_indices;
-
+			// Note that this method may leave some empty clusters (i.e. arrays with no block indices), including at the end.
+			basisu::vector< basisu::vector<uint32_t> > new_cluster_indices(m_optimized_cluster_selectors.size());
+						
 			// For each block: Determine which quantized selectors best encode that block, given its quantized endpoints.
 
 			basisu::vector<uint8_t> unpacked_optimized_cluster_selectors(16 * m_optimized_cluster_selectors.size());
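
Pre-sizing new_cluster_indices to m_optimized_cluster_selectors.size(), instead of growing it on demand, keeps slot c of the block-index table aligned with entry c of the selector codebook even when a cluster ends up owning no blocks, which is what the new comment about empty clusters is warning about, and it preserves the size equality that the new sanity checks at the top of the function verify. A hedged, paraphrased fragment of the consumer-side pattern this supports; the identifiers are the patch's, but the loop body is not lifted from the encoder:

    for (uint32_t c = 0; c < m_selector_cluster_block_indices.size(); c++)
    {
        const uint_vec &blocks = m_selector_cluster_block_indices[c];
        if (!blocks.size())
            continue;   // empty clusters are legal and simply skipped

        const etc_block &sel = m_optimized_cluster_selectors[c];   // same index c stays valid
        // ... apply sel to every block index listed in blocks ...
    }
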
@@ -1887,7 +1911,7 @@
 					}
 				}
 			}
-
+						
 			const uint32_t N = 1024;
 			for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
 			{
@@ -1913,7 +1937,7 @@
 					const uint32_t parent_selector_cluster = m_block_parent_selector_cluster.size() ? m_block_parent_selector_cluster[block_index] : 0;
 					const uint_vec *pCluster_indices = m_selector_clusters_within_each_parent_cluster.size() ? &m_selector_clusters_within_each_parent_cluster[parent_selector_cluster] : nullptr;
 
-					const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_indices.size();
+					const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_block_indices.size();
 
 #if 0
 					for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
@@ -2033,12 +2057,12 @@
 #ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
 #endif
-			
-			m_selector_cluster_indices.swap(new_cluster_indices);
+
+			m_selector_cluster_block_indices.swap(new_cluster_indices);
 		}
 
-		for (uint32_t i = 0; i < m_selector_cluster_indices.size(); i++)
-			vector_sort(m_selector_cluster_indices[i]);
+		for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
+			vector_sort(m_selector_cluster_block_indices[i]);
 	}
 
 	// TODO: Remove old ETC1 specific stuff, and thread this.
@@ -2226,7 +2250,7 @@
 
 		if (m_params.m_debug_stats)
 			debug_printf("Total subblock endpoints refined: %u (%3.1f%%)\n", total_subblocks_refined, total_subblocks_refined * 100.0f / total_subblocks_examined);
-
+				
 		return total_subblocks_refined;
 	}
 
diff --git a/encoder/basisu_frontend.h b/encoder/basisu_frontend.h
index d5a4088..1831d08 100644
--- a/encoder/basisu_frontend.h
+++ b/encoder/basisu_frontend.h
@@ -144,7 +144,7 @@
 		bool get_endpoint_cluster_color_is_used(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_used[individual_mode]; }
 
 		// Selector clusters
-		uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_indices.size()); }
+		uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_block_indices.size()); }
 		uint32_t get_block_selector_cluster_index(uint32_t block_index) const { return m_block_selector_cluster_index[block_index]; }
 		const etc_block &get_selector_cluster_selector_bits(uint32_t cluster_index) const { return m_optimized_cluster_selectors[cluster_index]; }
 
@@ -152,7 +152,7 @@
 		const bool_vec &get_selector_cluster_uses_global_cb_vec() const { return m_selector_cluster_uses_global_cb; }
 
 		// Returns block indices using each selector cluster
-		const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_indices[selector_cluster_index]; }
+		const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_block_indices[selector_cluster_index]; }
 
 		void dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks);
 		
@@ -279,13 +279,13 @@
 				
 		// The block(s) within each selector cluster
 		// Note: If you add anything here that uses selector cluster indicies, be sure to update optimize_selector_codebook()!
-		basisu::vector<uint_vec> m_selector_cluster_indices;
+		basisu::vector<uint_vec> m_selector_cluster_block_indices;
 
 		// The selector bits for each selector cluster.
 		basisu::vector<etc_block> m_optimized_cluster_selectors;
 
 		// The block(s) within each parent selector cluster.
-		basisu::vector<uint_vec> m_selector_parent_cluster_indices;
+		basisu::vector<uint_vec> m_selector_parent_cluster_block_indices;
 		
 		// Each block's parent selector cluster
 		uint8_vec m_block_parent_selector_cluster;
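
With the member renamed, the accessor get_selector_cluster_block_indices() and the field it returns now share the same wording. A small hypothetical caller, only to show the intended reading of the name (frontend, cluster_index and count_block are not from the patch):

    const uint_vec &blocks = frontend.get_selector_cluster_block_indices(cluster_index);
    for (uint32_t j = 0; j < blocks.size(); j++)
        count_block(blocks[j]);   // each entry is the index of a block assigned to this selector cluster
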
diff --git a/encoder/basisu_miniz.h b/encoder/basisu_miniz.h
index 4e59c4e..919e08c 100644
--- a/encoder/basisu_miniz.h
+++ b/encoder/basisu_miniz.h
@@ -594,15 +594,15 @@
 
 #ifndef MINIZ_HEADER_FILE_ONLY
 
+#include <string.h>
+#include <assert.h>
+
 namespace buminiz {
 
 typedef unsigned char mz_validate_uint16[sizeof(mz_uint16)==2 ? 1 : -1];
 typedef unsigned char mz_validate_uint32[sizeof(mz_uint32)==4 ? 1 : -1];
 typedef unsigned char mz_validate_uint64[sizeof(mz_uint64)==8 ? 1 : -1];
 
-#include <string.h>
-#include <assert.h>
-
 #define MZ_ASSERT(x) assert(x)
 
 #ifdef MINIZ_NO_MALLOC
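
Hoisting the two system includes above the namespace addresses a general C++ hazard: a header included inside a namespace drops whatever it declares into that namespace, and whether that even happens depends on whether the header was already pulled in earlier in the translation unit (its include guard would then skip it). The patch doesn't spell out the observed symptom, so the illustration below shows the generic failure mode being avoided, not the actual miniz one:

    // Anti-pattern the patch removes:
    namespace buminiz
    {
        #include <string.h>   // may declare e.g. buminiz::memcpy, or declare nothing
                              // if <string.h> was already included at global scope
    }
    // Code that expects ::memcpy, or a later global-scope include of <string.h> in
    // the same translation unit, then sees an include-order-dependent set of
    // declarations, so the includes belong before the namespace opens.
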
diff --git a/transcoder/basisu_containers.h b/transcoder/basisu_containers.h
index 14a02e8..acc3a95 100644
--- a/transcoder/basisu_containers.h
+++ b/transcoder/basisu_containers.h
@@ -1018,6 +1018,11 @@
    template<typename Key, typename Value = empty_type, typename Hasher = hasher<Key>, typename Equals = equal_to<Key> >
    class hash_map
    {
+   public:
+      class iterator;
+      class const_iterator;
+   
+   private:
       friend class iterator;
       friend class const_iterator;
 
@@ -1204,9 +1209,7 @@
          if (new_hash_size > m_values.size())
             rehash((uint32_t)new_hash_size);
       }
-
-      class const_iterator;
-
+            
       class iterator
       {
          friend class hash_map<Key, Value, Hasher, Equals>;
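
The new public forward declarations matter because of how friend declarations resolve: if no prior declaration of the named class is visible, "friend class iterator;" declares and befriends a class called iterator in the enclosing namespace, not the nested iterator that hash_map defines further down. Declaring the nested types first makes the friend declarations refer to them. A condensed sketch of the resulting shape (hypothetical class name, all other members elided):

    class hash_map_like
    {
    public:
        class iterator;                 // declare the nested types up front...
        class const_iterator;

    private:
        friend class iterator;          // ...so these befriend the nested classes,
        friend class const_iterator;    // not namespace-scope ones of the same name

        // ... buckets, hashing, insert/find, and the iterator definitions ...
    };
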
diff --git a/transcoder/basisu_transcoder.cpp b/transcoder/basisu_transcoder.cpp
index c99dda4..17c893e 100644
--- a/transcoder/basisu_transcoder.cpp
+++ b/transcoder/basisu_transcoder.cpp
@@ -30,8 +30,10 @@
 	#ifdef __EMSCRIPTEN__
 		// Can't use unaligned loads/stores with WebAssembly.
 		#define BASISD_USE_UNALIGNED_WORD_READS (0)
-	#else
+	#elif defined(_M_AMD64) || defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)
 		#define BASISD_USE_UNALIGNED_WORD_READS (1)
+	#else
+		#define BASISD_USE_UNALIGNED_WORD_READS (0)
 	#endif
 #endif
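
Limiting BASISD_USE_UNALIGNED_WORD_READS to x86/x64 reflects that those targets handle unaligned loads in hardware, while other architectures may trap, silently misread, or pay a heavy penalty; WebAssembly already had the macro forced to 0 above. A hedged sketch of how such a guard is typically consumed; the transcoder's real read helpers may differ, and the direct-load path additionally assumes a little-endian target (which both x86 variants are):

    #include <stdint.h>

    static inline uint16_t read_word_le(const uint8_t *p)
    {
    #if BASISD_USE_UNALIGNED_WORD_READS
        // x86/x64: a direct, possibly unaligned, 16-bit load is fine.
        return *reinterpret_cast<const uint16_t *>(p);
    #else
        // Everything else: assemble the value byte by byte.
        return (uint16_t)(p[0] | (p[1] << 8));
    #endif
    }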