// encoder/basisu_astc_hdr_common.h - external/github.com/BinomialLLC/basis_universal - Git at Google

 // File: basisu_astc_hdr_common.h
 #pragma once
 #include "basisu_enc.h"
 #include "basisu_gpu_texture.h"
 #include "../transcoder/basisu_astc_helpers.h"
 #include "../transcoder/basisu_astc_hdr_core.h"

 namespace basisu
 {
 	const uint32_t MAX_ASTC_HDR_BLOCK_W = 6, MAX_ASTC_HDR_BLOCK_H = 6;
 	const uint32_t MAX_ASTC_HDR_ENC_BLOCK_PIXELS = 6 * 6;

 	const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec)
 	const uint32_t MODE7_TOTAL_SUBMODES = 6;

 	// [ise_range][0] = # levels
 	// [ise_range][1...] = lerp value [0,64]
 	// in ASTC order
 	// Supported ISE weight ranges: 0 to 11, 12 total
 	const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_2_LEVELS; // ISE 0=2 levels
 	const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_32_LEVELS; // ISE 11=32 levels
 	const uint32_t MIN_SUPPORTED_WEIGHT_LEVELS = 2;
 	const uint32_t MAX_SUPPORTED_WEIGHT_LEVELS = 32;

 	extern const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][33];

 	const float Q_LOG_BIAS_4x4 = .125f; // the original UASTC HDR 4x4 log bias
 	const float Q_LOG_BIAS_6x6 = 1.0f; // the log bias both encoders use now

 	const float LDR_TO_HDR_NITS = 100.0f;

 	extern vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS];
 	extern uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][astc_index] -> linear index
 	extern uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][linear_index] -> astc_index

 	// Encoder options shared by the ASTC HDR routines declared in this header, which receive
 	// them as their `coptions` argument. The constructor only calls init(); init() is declared
 	// here and defined elsewhere.
 	struct astc_hdr_codec_base_options
 	{
 		// Error scale factors for the red and green channels.
 		// NOTE(review): presumably relative weights used when measuring block error - confirm in the .cpp.
 		float m_r_err_scale;
 		float m_g_err_scale;

 		// NOTE(review): presumably one of the Q_LOG_BIAS_* constants, fed to q()/q2() as log_bias - confirm.
 		float m_q_log_bias;

 		bool m_ultra_quant;

 		// When set, the ASTC HDR compressor may vary weight indices more aggressively for slightly
 		// higher compression in non-fastest mode. This will hurt BC6H quality, however.
 		bool m_allow_uber_mode;

 		bool m_mode7_full_s_optimization;

 		// NOTE(review): names suggest the first submode that packs without clamping is accepted - confirm.
 		bool m_take_first_non_clamping_mode11_submode;
 		bool m_take_first_non_clamping_mode7_submode;

 		bool m_disable_weight_plane_optimization;

 		astc_hdr_codec_base_options()
 		{
 			init();
 		}

 		void init();
 	};

 	// Returns bit `src_bit` (0 = least significant, 31 = most significant) of `src_val` as 0 or 1.
 	// `src_bit` must be in [0, 31]; this is only checked by assert().
 	inline int get_bit(
 		int src_val, int src_bit)
 	{
 		assert(src_bit >= 0 && src_bit <= 31);

 		// Shift the unsigned bit pattern: right-shifting a negative signed value is
 		// implementation-defined before C++20, while the unsigned shift is fully specified.
 		return static_cast<int>((static_cast<uint32_t>(src_val) >> src_bit) & 1u);
 	}

 	// ORs a single bit of `src_val` into `dst`: bit `src_bit` of src_val lands at position `dst_bit`.
 	// A zero source bit leaves dst untouched; bits already set in dst are never cleared.
 	// `dst_bit` must be in [0, 31] (assert only); get_bit() checks `src_bit` the same way.
 	inline void pack_bit(
 		int& dst, int dst_bit,
 		int src_val, int src_bit = 0)
 	{
 		assert(dst_bit >= 0 && dst_bit <= 31);

 		if (get_bit(src_val, src_bit))
 			dst |= (1 << dst_bit);
 	}

 	// Returns the basist::MAX_QLOG<bits> constant for a qlog of the given bit count.
 	// Supported bit counts are 7-12 and 16; anything else trips assert(0) and returns 0.
 	inline uint32_t get_max_qlog(uint32_t bits)
 	{
 		uint32_t max_qlog = 0;

 		switch (bits)
 		{
 		case 7: max_qlog = basist::MAX_QLOG7; break;
 		case 8: max_qlog = basist::MAX_QLOG8; break;
 		case 9: max_qlog = basist::MAX_QLOG9; break;
 		case 10: max_qlog = basist::MAX_QLOG10; break;
 		case 11: max_qlog = basist::MAX_QLOG11; break;
 		case 12: max_qlog = basist::MAX_QLOG12; break;
 		case 16: max_qlog = basist::MAX_QLOG16; break;
 		default: assert(0); break; // unsupported bit count
 		}

 		return max_qlog;
 	}

 // Disabled (compiled out by #if 0): float-returning counterpart of get_max_qlog() that maps a
 // qlog bit count (7-12 or 16) to the matching MAX_QLOG*_VAL constant.
 #if 0
 	inline float get_max_qlog_val(uint32_t bits)
 	{
 		switch (bits)
 		{
 		case 7: return MAX_QLOG7_VAL;
 		case 8: return MAX_QLOG8_VAL;
 		case 9: return MAX_QLOG9_VAL;
 		case 10: return MAX_QLOG10_VAL;
 		case 11: return MAX_QLOG11_VAL;
 		case 12: return MAX_QLOG12_VAL;
 		case 16: return MAX_QLOG16_VAL;
 		default: assert(0); break; // unsupported bit count
 		}
 		return 0;
 	}
 #endif

 // Disabled (compiled out by #if 0): the forward mapping that half_float_mantissa_to_qlog11() inverts.
 #if 0
 	// Input is the low 11 bits of the qlog
 	// Returns the 10-bit mantissa of the half float value
 	int qlog11_to_half_float_mantissa(int M)
 	{
 		assert(M <= 0x7FF);
 		int Mt;
 		// Three linear segments, selected by the range M falls in.
 		if (M < 512)
 			Mt = 3 * M;
 		else if (M >= 1536)
 			Mt = 5 * M - 2048;
 		else
 			Mt = 4 * M - 512;
 		return (Mt >> 3);
 	}
 #endif

 	// Maps the 10-bit mantissa of a half float value to the 11-bit qlog value.
 	// This is the inverse of qlog11_to_half_float_mantissa(), which is piecewise linear with three
 	// segments: each segment's inverse is evaluated in turn and accepted when its result falls
 	// inside that segment's qlog range.
 	inline int half_float_mantissa_to_qlog11(int hf)
 	{
 		const int scaled = hf * 8;

 		// Low segment (qlog < 512): inverse of (3 * M) >> 3, rounded.
 		const int low_seg = (scaled + 2) / 3;
 		if (low_seg < 512)
 			return low_seg;

 		// High segment (qlog >= 1536): inverse of (5 * M - 2048) >> 3, rounded.
 		const int high_seg = (scaled + 2048 + 4) / 5;
 		if (high_seg >= 1536)
 			return high_seg;

 		// Middle segment: inverse of (4 * M - 512) >> 3, rounded.
 		return (scaled + 512 + 2) / 4;
 	}

 	// Converts a half float bit pattern to its qlog16 representation.
 	// The input must not be signed, Inf or NaN (assert only). The result keeps the half's 5 exponent
 	// bits in bits [15:11] and stores the mantissa, remapped to qlog11, in bits [10:0].
 	inline int half_to_qlog16(int hf)
 	{
 		assert(!basist::half_is_signed((basist::half_float)hf) && !basist::is_half_inf_or_nan((basist::half_float)hf));

 		// The 5-bit exponent is carried through to qlog16 unchanged.
 		const int exponent_bits = (hf >> 10) & 0x1F;

 		// Invert the 10-bit mantissa to the nearest qlog11 (should be lossless).
 		const int qlog11_mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF);
 		assert(qlog11_mantissa <= 0x7FF);

 		// Combine into qlog16, which is what ASTC HDR interpolates using the [0-64] weights.
 		const uint32_t packed_qlog16 = (exponent_bits << 11) | qlog11_mantissa;

 		// Round-tripping back to half should reproduce the input exactly.
 		assert(astc_helpers::qlog16_to_half(packed_qlog16) == hf);

 		return packed_qlog16;
 	}

 	void interpolate_qlog12_colors(
 		const int e[2][3],
 		basist::half_float* pDecoded_half,
 		vec3F* pDecoded_float,
 		uint32_t n, uint32_t ise_weight_range);

 	bool get_astc_hdr_mode_11_block_colors(
 		const uint8_t* pEndpoints,
 		basist::half_float* pDecoded_half,
 		vec3F* pDecoded_float,
 		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);

 	bool get_astc_hdr_mode_7_block_colors(
 		const uint8_t* pEndpoints,
 		basist::half_float* pDecoded_half,
 		vec3F* pDecoded_float,
 		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);

 	// Fast, high precision piecewise linear approximation of log2(bias+x): the half is widened to
 	// float, the bias is added, and the float's raw bit pattern is returned as the result.
 	// The half may be zero, positive or denormal. NaN, Inf and negative inputs are not allowed.
 	BASISU_FORCE_INLINE double q(basist::half_float x, float log_bias)
 	{
 		const float x_as_float = fast_half_to_float_pos_not_inf_or_nan(x);
 		assert(x_as_float >= 0.0f);

 		// Reinterpret the biased float's bits as an unsigned integer.
 		union { float f; uint32_t u; } pun;
 		pun.f = x_as_float + log_bias;

 		// approx log2f(pun.f); returned as double for the precision
 		return (double)pun.u;
 	}

 	// Integer variant of q(): same half -> float -> add bias steps, but the float's raw bit
 	// pattern is returned directly as a uint32_t instead of being converted to double.
 	BASISU_FORCE_INLINE uint32_t q2(basist::half_float x, float log_bias)
 	{
 		const float x_as_float = fast_half_to_float_pos_not_inf_or_nan(x);
 		assert(x_as_float >= 0.0f);

 		// Reinterpret the biased float's bits as an unsigned integer.
 		union { float f; uint32_t u; } pun;
 		pun.f = x_as_float + log_bias;

 		return pun.u;
 	}

 	double eval_selectors(
 		uint32_t num_pixels,
 		uint8_t* pWeights,
 		uint32_t ise_weight_range,
 		const basist::half_float* pBlock_pixels_half,
 		uint32_t num_weight_levels,
 		const basist::half_float* pDecoded_half,
 		const astc_hdr_codec_base_options& coptions,
 		uint32_t usable_selector_bitmask = UINT32_MAX);

 	double eval_selectors_dual_plane(
 		uint32_t channel_index,
 		uint32_t num_pixels,
 		uint8_t* pWeights0, uint8_t* pWeights1,
 		const basist::half_float* pBlock_pixels_half,
 		uint32_t num_weight_levels,
 		const basist::half_float* pDecoded_half,
 		const astc_hdr_codec_base_options& coptions,
 		uint32_t usable_selector_bitmask = UINT32_MAX);

 	double compute_block_error(uint32_t num_pixels, const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_base_options& coptions);

 	const uint32_t FIRST_MODE7_SUBMODE_INDEX = 0;
 	const uint32_t MAX_MODE7_SUBMODE_INDEX = 5;

 	bool pack_mode7(
 		const vec3F& high_color_q16, const float s_q16,
 		uint32_t ise_endpoint_range, uint8_t* pEndpoints,
 		uint32_t ise_weight_range, // only used for determining biasing during CEM 7 packing
 		const astc_hdr_codec_base_options& coptions,
 		int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);

 	bool try_mode7(
 		uint32_t num_pixels,
 		uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
 		const vec3F& high_color_q16, const float s_q16,
 		const basist::half_float block_pixels_half[][3],
 		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions,
 		uint32_t ise_endpoint_range,
 		int32_t first_submode = 0, int32_t last_submode = MAX_MODE7_SUBMODE_INDEX);

 	bool pack_mode11(
 		const vec3F& low_color_q16, const vec3F& high_color_q16,
 		uint32_t ise_endpoint_range, uint8_t* pEndpoints,
 		const astc_hdr_codec_base_options& coptions,
 		bool direct_only, int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);

 	bool try_mode11(uint32_t num_pixels,
 		uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
 		const vec3F& low_color_q16, const vec3F& high_color_q16,
 		const basist::half_float block_pixels_half[][3],
 		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
 		bool constrain_ise_weight_selectors,
 		int32_t first_submode, int32_t last_submode, bool ignore_clamping);

 	bool try_mode11_dual_plane(uint32_t channel_index, uint32_t num_pixels,
 		uint8_t* pEndpoints, uint8_t* pWeights0, uint8_t* pWeights1, double& cur_block_error, uint32_t& submode_used,
 		const vec3F& low_color_q16, const vec3F& high_color_q16,
 		const basist::half_float block_pixels_half[][3],
 		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
 		bool constrain_ise_weight_selectors,
 		int32_t first_submode, int32_t last_submode, bool ignore_clamping);

 	const int FIRST_MODE11_SUBMODE_INDEX = -1;
 	const int MAX_MODE11_SUBMODE_INDEX = 7;

 	// Optimization mode selector, taken as the `opt_mode` argument by the mode 11 block encoders
 	// and refine_endpoints() declared in this header. Values are spelled out explicitly and match
 	// the default sequential numbering.
 	enum opt_mode_t
 	{
 		cNoOpt = 0,
 		cOrdinaryLeastSquares = 1,
 		cWeightedLeastSquares = 2,
 		cWeightedLeastSquaresHeavy = 3,
 		cWeightedAverage = 4
 	};

 	// Per-block statistics accepted by the encode_astc_hdr_block_*() functions in this header
 	// through their optional `pBlock_stats` argument (which defaults to nullptr).
 	struct encode_astc_block_stats
 	{
 		uint32_t m_num_pixels;
 		vec3F m_mean_q16; // NOTE(review): presumably the mean of the block's q16 pixels - confirm in the .cpp
 		vec3F m_axis_q16; // NOTE(review): presumably the block's principal axis in q16 space - confirm in the .cpp

 		// Declared here, defined elsewhere. Presumably fills in the fields above from
 		// `num_pixels` entries of `pBlock_pixels_q16` - confirm against the implementation.
 		void init(uint32_t num_pixels, const vec4F pBlock_pixels_q16[]);
 	};

 	double encode_astc_hdr_block_mode_11(
 		uint32_t num_pixels,
 		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
 		uint32_t ise_weight_range,
 		uint32_t& best_submode,
 		double cur_block_error,
 		uint8_t* blk_endpoints, uint8_t* blk_weights,
 		const astc_hdr_codec_base_options& coptions,
 		bool direct_only,
 		uint32_t ise_endpoint_range,
 		bool uber_mode,
 		bool constrain_ise_weight_selectors,
 		int32_t first_submode, int32_t last_submode, bool ignore_clamping,
 		opt_mode_t opt_mode,
 		const encode_astc_block_stats *pBlock_stats = nullptr);

 	double encode_astc_hdr_block_downsampled_mode_11(
 		uint32_t block_x, uint32_t block_y, uint32_t grid_x, uint32_t grid_y,
 		uint32_t ise_weight_range, uint32_t ise_endpoint_range,
 		uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
 		double cur_block_error,
 		int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
 		uint8_t* pBlk_endpoints, uint8_t* pBlk_weights, uint32_t& best_submode,
 		const astc_hdr_codec_base_options& coptions,
 		const encode_astc_block_stats* pBlock_stats = nullptr);

 	double encode_astc_hdr_block_mode_11_dual_plane(
 		uint32_t num_pixels,
 		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
 		uint32_t channel_index,		// 0-2
 		uint32_t ise_weight_range,
 		uint32_t& best_submode,
 		double cur_block_error,
 		uint8_t* blk_endpoints, uint8_t* blk_weights0, uint8_t* blk_weights1,
 		const astc_hdr_codec_base_options& coptions,
 		bool direct_only,
 		uint32_t ise_endpoint_range,
 		bool uber_mode,
 		bool constrain_ise_weight_selectors,
 		int32_t first_submode, int32_t last_submode,
 		bool ignore_clamping);

 	double encode_astc_hdr_block_mode_7(
 		uint32_t num_pixels,
 		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
 		uint32_t ise_weight_range,
 		uint32_t& best_submode,
 		double cur_block_error,
 		uint8_t* blk_endpoints,  //[4]
 		uint8_t* blk_weights, // [num_pixels]
 		const astc_hdr_codec_base_options& coptions,
 		uint32_t ise_endpoint_range,
 		int first_submode = 0, int last_submode = MAX_MODE7_SUBMODE_INDEX,
 		const encode_astc_block_stats *pBlock_stats = nullptr);

 	//--------------------------------------------------------------------------------------------------------------------------

 	// Unpacked description of a mode 11 endpoint encoding, exchanged with the
 	// pack_mode11(mode11_log_desc&, ...) and unpack_mode11() functions declared in this header.
 	// When m_maj_comp == 3 the encoding is "direct" and the six value fields hold raw R/G/B endpoints.
 	struct mode11_log_desc
 	{
 		int32_t m_submode;
 		int32_t m_maj_comp;

 		// First value triple: a/c/b0, or R0, G0, B0 if maj_comp==3 (direct)
 		int32_t m_a;  // positive
 		int32_t m_c;  // positive
 		int32_t m_b0; // positive

 		// Second value triple: b1/d0/d1, or R1, G1, B1 if maj_comp==3 (direct)
 		int32_t m_b1; // positive
 		int32_t m_d0; // if not direct, is signed
 		int32_t m_d1; // if not direct, is signed

 		// Field widths in bits (limits if not direct)
 		int32_t m_a_bits;
 		int32_t m_c_bits;
 		int32_t m_b_bits;
 		int32_t m_d_bits;

 		// Value limits (if not direct)
 		int32_t m_max_a_val;
 		int32_t m_max_c_val;
 		int32_t m_max_b_val;
 		int32_t m_min_d_val;
 		int32_t m_max_d_val;

 		// Resets the whole object via clear_obj().
 		void clear()
 		{
 			clear_obj(*this);
 		}

 		// True when this descriptor holds a direct encoding.
 		bool is_direct() const
 		{
 			return m_maj_comp == 3;
 		}
 	};

 	//--------------------------------------------------------------------------------------------------------------------------
 	bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range, bool early_out_if_clamped, int max_clamp_mag_accept_thresh);

 	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, int val_q[2][3], int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
 	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
 	void pack_astc_mode11_direct(uint8_t* pEndpoints, vec3F l_q16, vec3F h_q16);

 	bool pack_mode11(mode11_log_desc& desc, uint8_t* pEndpoints);
 	void unpack_mode11(const uint8_t* pEndpoints, mode11_log_desc& desc);

 	void decode_cem_11_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);
 	void decode_cem_7_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);

 	void dequantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights);

 	const float* get_6x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height);
 	const float* get_8x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height);

 	void compute_upsample_matrix(basisu::vector2D<float>& upsample_matrix, uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);
 	void compute_upsample_matrix_transposed(basisu::vector<float>& unweighted_downsample_matrix, uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);

 	void compute_diag_AtA_vector(uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height, const vector2D<float>& upsample_matrix, float* pDst_vec);

 	void downsample_weight_grid(
 		const float* pMatrix_weights,
 		uint32_t bx, uint32_t by,		// source/from dimension (block size)
 		uint32_t wx, uint32_t wy,		// dest/to dimension (grid size)
 		const uint8_t* pSrc_weights,	// these are dequantized weights, NOT ISE symbols, [by][bx]
 		uint8_t* pDst_weights);			// [wy][wx]

 	void downsample_ise_weights(
 		uint32_t weight_ise_range, uint32_t quant_weight_ise_range,
 		uint32_t block_w, uint32_t block_h,
 		uint32_t grid_w, uint32_t grid_h,
 		const uint8_t* pSrc_weights, uint8_t* pDst_weights);

 	void downsample_ise_weights_dual_plane(
 		uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
 		uint32_t block_w, uint32_t block_h,
 		uint32_t grid_w, uint32_t grid_h,
 		const uint8_t* pSrc_weights0, const uint8_t* pSrc_weights1,
 		uint8_t* pDst_weights);

 	bool refine_endpoints(
 		uint32_t cem,
 		uint32_t endpoint_ise_range,
 		uint8_t* pEndpoint_vals, // the endpoints to optimize
 		uint32_t block_w, uint32_t block_h, // block dimensions
 		uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
 		uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
 		const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets
 		astc_hdr_codec_base_options& coptions, opt_mode_t opt_mode);

 	extern bool g_astc_hdr_enc_initialized;

 	// This MUST be called before encoding any blocks.
 	void astc_hdr_enc_init();

 } // namespace basisu
	// File: basisu_astc_hdr_common.h
	#pragma once
	#include "basisu_enc.h"
	#include "basisu_gpu_texture.h"
	#include "../transcoder/basisu_astc_helpers.h"
	#include "../transcoder/basisu_astc_hdr_core.h"

	namespace basisu
	{
	const uint32_t MAX_ASTC_HDR_BLOCK_W = 6, MAX_ASTC_HDR_BLOCK_H = 6;
	const uint32_t MAX_ASTC_HDR_ENC_BLOCK_PIXELS = 6 * 6;

	const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec)
	const uint32_t MODE7_TOTAL_SUBMODES = 6;

	// [ise_range][0] = # levels
	// [ise_range][1...] = lerp value [0,64]
	// in ASTC order
	// Supported ISE weight ranges: 0 to 11, 12 total
	const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_2_LEVELS; // ISE 0=2 levels
	const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_32_LEVELS; // ISE 11=32 levels
	const uint32_t MIN_SUPPORTED_WEIGHT_LEVELS = 2;
	const uint32_t MAX_SUPPORTED_WEIGHT_LEVELS = 32;

	extern const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][33];

	const float Q_LOG_BIAS_4x4 = .125f; // the original UASTC HDR 4x4 log bias
	const float Q_LOG_BIAS_6x6 = 1.0f; // the log bias both encoders use now

	const float LDR_TO_HDR_NITS = 100.0f;

	extern vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS];
	extern uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][astc_index] -> linear index
	extern uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][linear_index] -> astc_index

	// Encoder options shared by the ASTC HDR routines declared in this header, which receive
	// them as their `coptions` argument. The constructor only calls init(); init() is declared
	// here and defined elsewhere.
	struct astc_hdr_codec_base_options
	{
		// Error scale factors for the red and green channels.
		// NOTE(review): presumably relative weights used when measuring block error - confirm in the .cpp.
		float m_r_err_scale;
		float m_g_err_scale;

		// NOTE(review): presumably one of the Q_LOG_BIAS_* constants, fed to q()/q2() as log_bias - confirm.
		float m_q_log_bias;

		bool m_ultra_quant;

		// When set, the ASTC HDR compressor may vary weight indices more aggressively for slightly
		// higher compression in non-fastest mode. This will hurt BC6H quality, however.
		bool m_allow_uber_mode;

		bool m_mode7_full_s_optimization;

		// NOTE(review): names suggest the first submode that packs without clamping is accepted - confirm.
		bool m_take_first_non_clamping_mode11_submode;
		bool m_take_first_non_clamping_mode7_submode;

		bool m_disable_weight_plane_optimization;

		astc_hdr_codec_base_options()
		{
			init();
		}

		void init();
	};

	// Returns bit `src_bit` (0 = least significant, 31 = most significant) of `src_val` as 0 or 1.
	// `src_bit` must be in [0, 31]; this is only checked by assert().
	inline int get_bit(
		int src_val, int src_bit)
	{
		assert(src_bit >= 0 && src_bit <= 31);

		// Shift the unsigned bit pattern: right-shifting a negative signed value is
		// implementation-defined before C++20, while the unsigned shift is fully specified.
		return static_cast<int>((static_cast<uint32_t>(src_val) >> src_bit) & 1u);
	}

	// ORs a single bit of `src_val` into `dst`: bit `src_bit` of src_val lands at position `dst_bit`.
	// A zero source bit leaves dst untouched; bits already set in dst are never cleared.
	// `dst_bit` must be in [0, 31] (assert only); get_bit() checks `src_bit` the same way.
	inline void pack_bit(
		int& dst, int dst_bit,
		int src_val, int src_bit = 0)
	{
		assert(dst_bit >= 0 && dst_bit <= 31);
		int bit = get_bit(src_val, src_bit);
		dst |= (bit << dst_bit);
	}

	// Returns the basist::MAX_QLOG<bits> constant for a qlog of the given bit count.
	// Supported bit counts are 7-12 and 16; anything else trips assert(0) and returns 0.
	inline uint32_t get_max_qlog(uint32_t bits)
	{
		uint32_t max_qlog = 0;

		switch (bits)
		{
		case 7: max_qlog = basist::MAX_QLOG7; break;
		case 8: max_qlog = basist::MAX_QLOG8; break;
		case 9: max_qlog = basist::MAX_QLOG9; break;
		case 10: max_qlog = basist::MAX_QLOG10; break;
		case 11: max_qlog = basist::MAX_QLOG11; break;
		case 12: max_qlog = basist::MAX_QLOG12; break;
		case 16: max_qlog = basist::MAX_QLOG16; break;
		default: assert(0); break; // unsupported bit count
		}

		return max_qlog;
	}

	// Disabled (compiled out by #if 0): float-returning counterpart of get_max_qlog() that maps a
	// qlog bit count (7-12 or 16) to the matching MAX_QLOG*_VAL constant.
	#if 0
	inline float get_max_qlog_val(uint32_t bits)
	{
	switch (bits)
	{
	case 7: return MAX_QLOG7_VAL;
	case 8: return MAX_QLOG8_VAL;
	case 9: return MAX_QLOG9_VAL;
	case 10: return MAX_QLOG10_VAL;
	case 11: return MAX_QLOG11_VAL;
	case 12: return MAX_QLOG12_VAL;
	case 16: return MAX_QLOG16_VAL;
	default: assert(0); break; // unsupported bit count
	}
	return 0;
	}
	#endif

	// Disabled (compiled out by #if 0): the forward mapping that half_float_mantissa_to_qlog11() inverts.
	#if 0
	// Input is the low 11 bits of the qlog
	// Returns the 10-bit mantissa of the half float value
	int qlog11_to_half_float_mantissa(int M)
	{
	assert(M <= 0x7FF);
	int Mt;
	// Three linear segments, selected by the range M falls in.
	if (M < 512)
	Mt = 3 * M;
	else if (M >= 1536)
	Mt = 5 * M - 2048;
	else
	Mt = 4 * M - 512;
	return (Mt >> 3);
	}
	#endif

	// Maps the 10-bit mantissa of a half float value to the 11-bit qlog value.
	// This is the inverse of qlog11_to_half_float_mantissa(), which is piecewise linear with three
	// segments: each segment's inverse is evaluated in turn and accepted when its result falls
	// inside that segment's qlog range.
	inline int half_float_mantissa_to_qlog11(int hf)
	{
		const int scaled = hf * 8;

		// Low segment (qlog < 512): inverse of (3 * M) >> 3, rounded.
		const int low_seg = (scaled + 2) / 3;
		if (low_seg < 512)
			return low_seg;

		// High segment (qlog >= 1536): inverse of (5 * M - 2048) >> 3, rounded.
		const int high_seg = (scaled + 2048 + 4) / 5;
		if (high_seg >= 1536)
			return high_seg;

		// Middle segment: inverse of (4 * M - 512) >> 3, rounded.
		return (scaled + 512 + 2) / 4;
	}

	// Converts a half float bit pattern to its qlog16 representation.
	// The input must not be signed, Inf or NaN (assert only). The result keeps the half's 5 exponent
	// bits in bits [15:11] and stores the mantissa, remapped to qlog11, in bits [10:0].
	inline int half_to_qlog16(int hf)
	{
		assert(!basist::half_is_signed((basist::half_float)hf) && !basist::is_half_inf_or_nan((basist::half_float)hf));

		// extract 5 bits exponent, which is carried through to qlog16 unchanged
		const int exp = (hf >> 10) & 0x1F;

		// extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless)
		const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF);
		assert(mantissa <= 0x7FF);

		// Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights.
		uint32_t qlog16 = (exp << 11) | mantissa;

		// should be a lossless operation
		assert(astc_helpers::qlog16_to_half(qlog16) == hf);

		return qlog16;
	}

	void interpolate_qlog12_colors(
	const int e[2][3],
	basist::half_float* pDecoded_half,
	vec3F* pDecoded_float,
	uint32_t n, uint32_t ise_weight_range);

	bool get_astc_hdr_mode_11_block_colors(
	const uint8_t* pEndpoints,
	basist::half_float* pDecoded_half,
	vec3F* pDecoded_float,
	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);

	bool get_astc_hdr_mode_7_block_colors(
	const uint8_t* pEndpoints,
	basist::half_float* pDecoded_half,
	vec3F* pDecoded_float,
	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);

	// Fast, high precision piecewise linear approximation of log2(bias+x): the half is widened to
	// float, the bias is added, and the float's raw bit pattern is returned as the result.
	// The half may be zero, positive or denormal. NaN, Inf and negative inputs are not allowed.
	BASISU_FORCE_INLINE double q(basist::half_float x, float log_bias)
	{
		const float x_as_float = fast_half_to_float_pos_not_inf_or_nan(x);
		assert(x_as_float >= 0.0f);

		// Reinterpret the biased float's bits as an unsigned integer.
		union { float f; uint32_t u; } pun;
		pun.f = x_as_float + log_bias;

		// approx log2f(pun.f); returned as double for the precision
		return (double)pun.u;
	}

	// Integer variant of q(): same half -> float -> add bias steps, but the float's raw bit
	// pattern is returned directly as a uint32_t instead of being converted to double.
	BASISU_FORCE_INLINE uint32_t q2(basist::half_float x, float log_bias)
	{
		const float x_as_float = fast_half_to_float_pos_not_inf_or_nan(x);
		assert(x_as_float >= 0.0f);

		// Reinterpret the biased float's bits as an unsigned integer.
		union { float f; uint32_t u; } pun;
		pun.f = x_as_float + log_bias;

		return pun.u;
	}

	double eval_selectors(
	uint32_t num_pixels,
	uint8_t* pWeights,
	uint32_t ise_weight_range,
	const basist::half_float* pBlock_pixels_half,
	uint32_t num_weight_levels,
	const basist::half_float* pDecoded_half,
	const astc_hdr_codec_base_options& coptions,
	uint32_t usable_selector_bitmask = UINT32_MAX);

	double eval_selectors_dual_plane(
	uint32_t channel_index,
	uint32_t num_pixels,
	uint8_t* pWeights0, uint8_t* pWeights1,
	const basist::half_float* pBlock_pixels_half,
	uint32_t num_weight_levels,
	const basist::half_float* pDecoded_half,
	const astc_hdr_codec_base_options& coptions,
	uint32_t usable_selector_bitmask = UINT32_MAX);

	double compute_block_error(uint32_t num_pixels, const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_base_options& coptions);

	const uint32_t FIRST_MODE7_SUBMODE_INDEX = 0;
	const uint32_t MAX_MODE7_SUBMODE_INDEX = 5;

	bool pack_mode7(
	const vec3F& high_color_q16, const float s_q16,
	uint32_t ise_endpoint_range, uint8_t* pEndpoints,
	uint32_t ise_weight_range, // only used for determining biasing during CEM 7 packing
	const astc_hdr_codec_base_options& coptions,
	int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);

	bool try_mode7(
	uint32_t num_pixels,
	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
	const vec3F& high_color_q16, const float s_q16,
	const basist::half_float block_pixels_half[][3],
	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions,
	uint32_t ise_endpoint_range,
	int32_t first_submode = 0, int32_t last_submode = MAX_MODE7_SUBMODE_INDEX);

	bool pack_mode11(
	const vec3F& low_color_q16, const vec3F& high_color_q16,
	uint32_t ise_endpoint_range, uint8_t* pEndpoints,
	const astc_hdr_codec_base_options& coptions,
	bool direct_only, int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);

	bool try_mode11(uint32_t num_pixels,
	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
	const vec3F& low_color_q16, const vec3F& high_color_q16,
	const basist::half_float block_pixels_half[][3],
	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
	bool constrain_ise_weight_selectors,
	int32_t first_submode, int32_t last_submode, bool ignore_clamping);

	bool try_mode11_dual_plane(uint32_t channel_index, uint32_t num_pixels,
	uint8_t* pEndpoints, uint8_t* pWeights0, uint8_t* pWeights1, double& cur_block_error, uint32_t& submode_used,
	const vec3F& low_color_q16, const vec3F& high_color_q16,
	const basist::half_float block_pixels_half[][3],
	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
	bool constrain_ise_weight_selectors,
	int32_t first_submode, int32_t last_submode, bool ignore_clamping);

	const int FIRST_MODE11_SUBMODE_INDEX = -1;
	const int MAX_MODE11_SUBMODE_INDEX = 7;

	// Optimization mode selector, taken as the `opt_mode` argument by the mode 11 block encoders
	// and refine_endpoints() declared in this header. Values are spelled out explicitly and match
	// the default sequential numbering.
	enum opt_mode_t
	{
		cNoOpt = 0,
		cOrdinaryLeastSquares = 1,
		cWeightedLeastSquares = 2,
		cWeightedLeastSquaresHeavy = 3,
		cWeightedAverage = 4
	};

	// Per-block statistics accepted by the encode_astc_hdr_block_*() functions in this header
	// through their optional `pBlock_stats` argument (which defaults to nullptr).
	struct encode_astc_block_stats
	{
	uint32_t m_num_pixels;
	vec3F m_mean_q16; // NOTE(review): presumably the mean of the block's q16 pixels - confirm in the .cpp
	vec3F m_axis_q16; // NOTE(review): presumably the block's principal axis in q16 space - confirm in the .cpp

	// Declared here, defined elsewhere. Presumably fills in the fields above from
	// `num_pixels` entries of `pBlock_pixels_q16` - confirm against the implementation.
	void init(uint32_t num_pixels, const vec4F pBlock_pixels_q16[]);
	};

	double encode_astc_hdr_block_mode_11(
	uint32_t num_pixels,
	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
	uint32_t ise_weight_range,
	uint32_t& best_submode,
	double cur_block_error,
	uint8_t* blk_endpoints, uint8_t* blk_weights,
	const astc_hdr_codec_base_options& coptions,
	bool direct_only,
	uint32_t ise_endpoint_range,
	bool uber_mode,
	bool constrain_ise_weight_selectors,
	int32_t first_submode, int32_t last_submode, bool ignore_clamping,
	opt_mode_t opt_mode,
	const encode_astc_block_stats *pBlock_stats = nullptr);

	double encode_astc_hdr_block_downsampled_mode_11(
	uint32_t block_x, uint32_t block_y, uint32_t grid_x, uint32_t grid_y,
	uint32_t ise_weight_range, uint32_t ise_endpoint_range,
	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
	double cur_block_error,
	int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
	uint8_t* pBlk_endpoints, uint8_t* pBlk_weights, uint32_t& best_submode,
	const astc_hdr_codec_base_options& coptions,
	const encode_astc_block_stats* pBlock_stats = nullptr);

	double encode_astc_hdr_block_mode_11_dual_plane(
	uint32_t num_pixels,
	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
	uint32_t channel_index, // 0-2
	uint32_t ise_weight_range,
	uint32_t& best_submode,
	double cur_block_error,
	uint8_t* blk_endpoints, uint8_t* blk_weights0, uint8_t* blk_weights1,
	const astc_hdr_codec_base_options& coptions,
	bool direct_only,
	uint32_t ise_endpoint_range,
	bool uber_mode,
	bool constrain_ise_weight_selectors,
	int32_t first_submode, int32_t last_submode,
	bool ignore_clamping);

	double encode_astc_hdr_block_mode_7(
	uint32_t num_pixels,
	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
	uint32_t ise_weight_range,
	uint32_t& best_submode,
	double cur_block_error,
	uint8_t* blk_endpoints, //[4]
	uint8_t* blk_weights, // [num_pixels]
	const astc_hdr_codec_base_options& coptions,
	uint32_t ise_endpoint_range,
	int first_submode = 0, int last_submode = MAX_MODE7_SUBMODE_INDEX,
	const encode_astc_block_stats *pBlock_stats = nullptr);

	//--------------------------------------------------------------------------------------------------------------------------

	// Unpacked description of a mode 11 endpoint encoding, exchanged with the
	// pack_mode11(mode11_log_desc&, ...) and unpack_mode11() functions declared in this header.
	// When m_maj_comp == 3 the encoding is "direct" and the six value fields hold raw R/G/B endpoints.
	struct mode11_log_desc
	{
		int32_t m_submode;
		int32_t m_maj_comp;

		// First value triple: a/c/b0, or R0, G0, B0 if maj_comp==3 (direct)
		int32_t m_a;  // positive
		int32_t m_c;  // positive
		int32_t m_b0; // positive

		// Second value triple: b1/d0/d1, or R1, G1, B1 if maj_comp==3 (direct)
		int32_t m_b1; // positive
		int32_t m_d0; // if not direct, is signed
		int32_t m_d1; // if not direct, is signed

		// Field widths in bits (limits if not direct)
		int32_t m_a_bits;
		int32_t m_c_bits;
		int32_t m_b_bits;
		int32_t m_d_bits;

		// Value limits (if not direct)
		int32_t m_max_a_val;
		int32_t m_max_c_val;
		int32_t m_max_b_val;
		int32_t m_min_d_val;
		int32_t m_max_d_val;

		// Resets the whole object via clear_obj().
		void clear()
		{
			clear_obj(*this);
		}

		// True when this descriptor holds a direct encoding.
		bool is_direct() const
		{
			return m_maj_comp == 3;
		}
	};

	//--------------------------------------------------------------------------------------------------------------------------
	bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range, bool early_out_if_clamped, int max_clamp_mag_accept_thresh);

	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, int val_q[2][3], int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
	void pack_astc_mode11_direct(uint8_t* pEndpoints, vec3F l_q16, vec3F h_q16);

	bool pack_mode11(mode11_log_desc& desc, uint8_t* pEndpoints);
	void unpack_mode11(const uint8_t* pEndpoints, mode11_log_desc& desc);

	void decode_cem_11_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);
	void decode_cem_7_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);

	void dequantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights);

	const float* get_6x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height);
	const float* get_8x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height);

	void compute_upsample_matrix(basisu::vector2D<float>& upsample_matrix, uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);
	void compute_upsample_matrix_transposed(basisu::vector<float>& unweighted_downsample_matrix, uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);

	void compute_diag_AtA_vector(uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height, const vector2D<float>& upsample_matrix, float* pDst_vec);

	void downsample_weight_grid(
	const float* pMatrix_weights,
	uint32_t bx, uint32_t by, // source/from dimension (block size)
	uint32_t wx, uint32_t wy, // dest/to dimension (grid size)
	const uint8_t* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
	uint8_t* pDst_weights); // [wy][wx]

	void downsample_ise_weights(
	uint32_t weight_ise_range, uint32_t quant_weight_ise_range,
	uint32_t block_w, uint32_t block_h,
	uint32_t grid_w, uint32_t grid_h,
	const uint8_t* pSrc_weights, uint8_t* pDst_weights);

	void downsample_ise_weights_dual_plane(
	uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
	uint32_t block_w, uint32_t block_h,
	uint32_t grid_w, uint32_t grid_h,
	const uint8_t* pSrc_weights0, const uint8_t* pSrc_weights1,
	uint8_t* pDst_weights);

	bool refine_endpoints(
	uint32_t cem,
	uint32_t endpoint_ise_range,
	uint8_t* pEndpoint_vals, // the endpoints to optimize
	uint32_t block_w, uint32_t block_h, // block dimensions
	uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
	const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets
	astc_hdr_codec_base_options& coptions, opt_mode_t opt_mode);

	extern bool g_astc_hdr_enc_initialized;

	// This MUST be called before encoding any blocks.
	void astc_hdr_enc_init();

	} // namespace basisu