| // basisu_enc.h |
| // Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| #pragma once |
| #include "../transcoder/basisu.h" |
| #include "../transcoder/basisu_transcoder_internal.h" |
| |
| #include <mutex> |
| #include <atomic> |
| #include <condition_variable> |
| #include <functional> |
| #include <thread> |
| #include <unordered_map> |
| #include <ostream> |
| |
| #if !defined(_WIN32) || defined(__MINGW32__) |
| #include <libgen.h> |
| #endif |
| |
| // This module is really just a huge grab bag of classes and helper functions needed by the encoder. |
| |
| // If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time. |
| #define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0) |
| |
| #if BASISU_SUPPORT_SSE |
| // Declared in basisu_kernels_imp.h, but we can't include that here otherwise it would lead to circular type errors. |
| extern void update_covar_matrix_16x16_sse41(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16); |
| #endif |
| |
| namespace basisu |
| { |
| extern uint8_t g_hamming_dist[256]; |
| extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8]; |
| |
| // true if basisu_encoder_init() has been called and returned. |
| extern bool g_library_initialized; |
| |
| // Encoder library initialization. |
| // This function MUST be called before encoding anything! |
| void basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false); |
| void basisu_encoder_deinit(); |
| |
| // basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1 |
| extern void detect_sse41(); |
| |
| #if BASISU_SUPPORT_SSE |
| extern bool g_cpu_supports_sse41; |
| #else |
| const bool g_cpu_supports_sse41 = false; |
| #endif |
| |
| void error_vprintf(const char* pFmt, va_list args); |
| void error_printf(const char *pFmt, ...); |
| |
| // Helpers |
| |
| inline uint8_t clamp255(int32_t i) |
| { |
| return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); |
| } |
| |
| inline int32_t clampi(int32_t value, int32_t low, int32_t high) |
| { |
| if (value < low) |
| value = low; |
| else if (value > high) |
| value = high; |
| return value; |
| } |
| |
| inline uint8_t mul_8(uint32_t v, uint32_t a) |
| { |
| v = v * a + 128; |
| return (uint8_t)((v + (v >> 8)) >> 8); |
| } |
| |
| inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) |
| { |
| assert(codesize <= 64); |
| uint64_t bits = 0; |
| uint32_t total_bits = 0; |
| |
| while (total_bits < codesize) |
| { |
| uint32_t byte_bit_offset = bit_offset & 7; |
| uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); |
| |
| uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; |
| byte_bits &= ((1 << bits_to_read) - 1); |
| |
| bits |= ((uint64_t)(byte_bits) << total_bits); |
| |
| total_bits += bits_to_read; |
| bit_offset += bits_to_read; |
| } |
| |
| return bits; |
| } |
| |
| inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) |
| { |
| assert(codesize <= 32); |
| uint32_t bits = 0; |
| uint32_t total_bits = 0; |
| |
| while (total_bits < codesize) |
| { |
| uint32_t byte_bit_offset = bit_offset & 7; |
| uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); |
| |
| uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; |
| byte_bits &= ((1 << bits_to_read) - 1); |
| |
| bits |= (byte_bits << total_bits); |
| |
| total_bits += bits_to_read; |
| bit_offset += bits_to_read; |
| } |
| |
| return bits; |
| } |
| |
| // Hashing |
| |
| inline uint32_t bitmix32c(uint32_t v) |
| { |
| v = (v + 0x7ed55d16) + (v << 12); |
| v = (v ^ 0xc761c23c) ^ (v >> 19); |
| v = (v + 0x165667b1) + (v << 5); |
| v = (v + 0xd3a2646c) ^ (v << 9); |
| v = (v + 0xfd7046c5) + (v << 3); |
| v = (v ^ 0xb55a4f09) ^ (v >> 16); |
| return v; |
| } |
| |
| inline uint32_t bitmix32(uint32_t v) |
| { |
| v -= (v << 6); |
| v ^= (v >> 17); |
| v -= (v << 9); |
| v ^= (v << 4); |
| v -= (v << 3); |
| v ^= (v << 10); |
| v ^= (v >> 15); |
| return v; |
| } |
| |
| inline uint32_t wang_hash(uint32_t seed) |
| { |
| seed = (seed ^ 61) ^ (seed >> 16); |
| seed *= 9; |
| seed = seed ^ (seed >> 4); |
| seed *= 0x27d4eb2d; |
| seed = seed ^ (seed >> 15); |
| return seed; |
| } |
| |
| uint32_t hash_hsieh(const uint8_t* pBuf, size_t len); |
| |
| template <typename Key> |
| struct bit_hasher |
| { |
| std::size_t operator()(const Key& k) const |
| { |
| return hash_hsieh(reinterpret_cast<const uint8_t *>(&k), sizeof(k)); |
| } |
| }; |
| |
| class running_stat |
| { |
| public: |
| running_stat() { clear(); } |
| |
| void clear() |
| { |
| m_n = 0; |
| m_total = 0; |
| m_old_m = 0; |
| m_new_m = 0; |
| m_old_s = 0; |
| m_new_s = 0; |
| m_min = 0; |
| m_max = 0; |
| } |
| |
| void push(double x) |
| { |
| m_n++; |
| m_total += x; |
| if (m_n == 1) |
| { |
| m_old_m = m_new_m = x; |
| m_old_s = 0.0; |
| m_min = x; |
| m_max = x; |
| } |
| else |
| { |
| // See Knuth TAOCP vol 2, 3rd edition, page 232 |
| m_new_m = m_old_m + (x - m_old_m) / m_n; |
| m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m); |
| m_old_m = m_new_m; |
| m_old_s = m_new_s; |
| m_min = basisu::minimum(x, m_min); |
| m_max = basisu::maximum(x, m_max); |
| } |
| } |
| |
| uint32_t get_num() const |
| { |
| return m_n; |
| } |
| |
| double get_total() const |
| { |
| return m_total; |
| } |
| |
| double get_mean() const |
| { |
| return (m_n > 0) ? m_new_m : 0.0; |
| } |
| |
| // Returns sample variance |
| double get_variance() const |
| { |
| return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0); |
| } |
| |
| double get_std_dev() const |
| { |
| return sqrt(get_variance()); |
| } |
| |
| double get_min() const |
| { |
| return m_min; |
| } |
| |
| double get_max() const |
| { |
| return m_max; |
| } |
| |
| private: |
| uint32_t m_n; |
| double m_total, m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max; |
| }; |
| |
| // Linear algebra |
| |
| template <uint32_t N, typename T> |
| class vec |
| { |
| protected: |
| T m_v[N]; |
| |
| public: |
| enum { num_elements = N }; |
| |
| inline vec() { } |
| inline vec(eZero) { set_zero(); } |
| |
| explicit inline vec(T val) { set(val); } |
| inline vec(T v0, T v1) { set(v0, v1); } |
| inline vec(T v0, T v1, T v2) { set(v0, v1, v2); } |
| inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); } |
| inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; } |
| template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); } |
| |
| inline T operator[](uint32_t i) const { assert(i < N); return m_v[i]; } |
| inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; } |
| |
| inline T getX() const { return m_v[0]; } |
| inline T getY() const { static_assert(N >= 2, "N too small"); return m_v[1]; } |
| inline T getZ() const { static_assert(N >= 3, "N too small"); return m_v[2]; } |
| inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; } |
| |
| inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false; return true; } |
| inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; } |
| |
| inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; } |
| |
| template <uint32_t OtherN, typename OtherT> |
| inline vec &set(const vec<OtherN, OtherT> &other) |
| { |
| uint32_t i; |
| if ((const void *)(&other) == (const void *)(this)) |
| return *this; |
| const uint32_t m = minimum(OtherN, N); |
| for (i = 0; i < m; i++) |
| m_v[i] = static_cast<T>(other[i]); |
| for (; i < N; i++) |
| m_v[i] = 0; |
| return *this; |
| } |
| |
| inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; } |
| inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; } |
| inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; } |
| |
| inline vec &set(T v0, T v1) |
| { |
| m_v[0] = v0; |
| if (N >= 2) |
| { |
| m_v[1] = v1; |
| clear_elements(2, N); |
| } |
| return *this; |
| } |
| |
| inline vec &set(T v0, T v1, T v2) |
| { |
| m_v[0] = v0; |
| if (N >= 2) |
| { |
| m_v[1] = v1; |
| if (N >= 3) |
| { |
| m_v[2] = v2; |
| clear_elements(3, N); |
| } |
| } |
| return *this; |
| } |
| |
| inline vec &set(T v0, T v1, T v2, T v3) |
| { |
| m_v[0] = v0; |
| if (N >= 2) |
| { |
| m_v[1] = v1; |
| if (N >= 3) |
| { |
| m_v[2] = v2; |
| |
| if (N >= 4) |
| { |
| m_v[3] = v3; |
| clear_elements(5, N); |
| } |
| } |
| } |
| return *this; |
| } |
| |
| inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; } |
| template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; } |
| |
| inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); } |
| inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); } |
| |
| inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; } |
| inline vec operator+ () const { return *this; } |
| inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; } |
| inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; } |
| inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; } |
| inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; } |
| inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; } |
| inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; } |
| |
| friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; } |
| friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; } |
| friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; } |
| friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; } |
| friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; } |
| friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; } |
| |
| static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; } |
| |
| inline T dot(const vec &rhs) const { return dot_product(*this, rhs); } |
| |
| inline T norm() const { return dot_product(*this, *this); } |
| inline T length() const { return sqrt(norm()); } |
| |
| inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; } |
| inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; } |
| |
| inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); } |
| inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); } |
| |
| inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; } |
| |
| inline vec &clamp(T l, T h) |
| { |
| for (uint32_t i = 0; i < N; i++) |
| m_v[i] = basisu::clamp(m_v[i], l, h); |
| return *this; |
| } |
| |
| static vec component_min(const vec& a, const vec& b) |
| { |
| vec res; |
| for (uint32_t i = 0; i < N; i++) |
| res[i] = minimum(a[i], b[i]); |
| return res; |
| } |
| |
| static vec component_max(const vec& a, const vec& b) |
| { |
| vec res; |
| for (uint32_t i = 0; i < N; i++) |
| res[i] = maximum(a[i], b[i]); |
| return res; |
| } |
| }; |
| |
| typedef vec<4, double> vec4D; |
| typedef vec<3, double> vec3D; |
| typedef vec<2, double> vec2D; |
| typedef vec<1, double> vec1D; |
| |
| typedef vec<4, float> vec4F; |
| typedef vec<3, float> vec3F; |
| typedef vec<2, float> vec2F; |
| typedef vec<1, float> vec1F; |
| |
| typedef vec<16, float> vec16F; |
| |
| template <uint32_t Rows, uint32_t Cols, typename T> |
| class matrix |
| { |
| public: |
| typedef vec<Rows, T> col_vec; |
| typedef vec<Cols, T> row_vec; |
| |
| typedef T scalar_type; |
| |
| enum { rows = Rows, cols = Cols }; |
| |
| protected: |
| row_vec m_r[Rows]; |
| |
| public: |
| inline matrix() {} |
| inline matrix(eZero) { set_zero(); } |
| inline matrix(const matrix &other) { for (uint32_t i = 0; i < Rows; i++) m_r[i] = other.m_r[i]; } |
| inline matrix &operator=(const matrix &rhs) { if (this != &rhs) for (uint32_t i = 0; i < Rows; i++) m_r[i] = rhs.m_r[i]; return *this; } |
| |
| inline T operator()(uint32_t r, uint32_t c) const { assert((r < Rows) && (c < Cols)); return m_r[r][c]; } |
| inline T &operator()(uint32_t r, uint32_t c) { assert((r < Rows) && (c < Cols)); return m_r[r][c]; } |
| |
| inline const row_vec &operator[](uint32_t r) const { assert(r < Rows); return m_r[r]; } |
| inline row_vec &operator[](uint32_t r) { assert(r < Rows); return m_r[r]; } |
| |
| inline matrix &set_zero() |
| { |
| for (uint32_t i = 0; i < Rows; i++) |
| m_r[i].set_zero(); |
| return *this; |
| } |
| |
| inline matrix &set_identity() |
| { |
| for (uint32_t i = 0; i < Rows; i++) |
| { |
| m_r[i].set_zero(); |
| if (i < Cols) |
| m_r[i][i] = 1.0f; |
| } |
| return *this; |
| } |
| }; |
| |
| template<uint32_t N, typename VectorType> |
| inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix) |
| { |
| VectorType axis; |
| if (N == 1) |
| axis.set(1.0f); |
| else |
| { |
| for (uint32_t i = 0; i < N; i++) |
| axis[i] = lerp(.75f, 1.25f, i * (1.0f / maximum<int>(N - 1, 1))); |
| } |
| |
| VectorType prev_axis(axis); |
| |
| // Power iterations |
| for (uint32_t power_iter = 0; power_iter < 8; power_iter++) |
| { |
| VectorType trial_axis; |
| double max_sum = 0; |
| |
| for (uint32_t i = 0; i < N; i++) |
| { |
| double sum = 0; |
| for (uint32_t j = 0; j < N; j++) |
| sum += cmatrix[i][j] * axis[j]; |
| |
| trial_axis[i] = static_cast<float>(sum); |
| |
| max_sum = maximum(fabs(sum), max_sum); |
| } |
| |
| if (max_sum != 0.0f) |
| trial_axis *= static_cast<float>(1.0f / max_sum); |
| |
| VectorType delta_axis(prev_axis - trial_axis); |
| |
| prev_axis = axis; |
| axis = trial_axis; |
| |
| if (delta_axis.norm() < .0024f) |
| break; |
| } |
| |
| return axis.normalize_in_place(); |
| } |
| |
| template<typename T> inline void indirect_sort(uint32_t num_indices, uint32_t* pIndices, const T* pKeys) |
| { |
| for (uint32_t i = 0; i < num_indices; i++) |
| pIndices[i] = i; |
| |
| std::sort( |
| pIndices, |
| pIndices + num_indices, |
| [pKeys](uint32_t a, uint32_t b) { return pKeys[a] < pKeys[b]; } |
| ); |
| } |
| |
| // 1-4 byte direct Radix sort. |
| template <typename T> |
| T* radix_sort(uint32_t num_vals, T* pBuf0, T* pBuf1, uint32_t key_ofs, uint32_t key_size) |
| { |
| assert(key_ofs < sizeof(T)); |
| assert((key_size >= 1) && (key_size <= 4)); |
| |
| uint32_t hist[256 * 4]; |
| |
| memset(hist, 0, sizeof(hist[0]) * 256 * key_size); |
| |
| #define BASISU_GET_KEY(p) (*(uint32_t *)((uint8_t *)(p) + key_ofs)) |
| |
| if (key_size == 4) |
| { |
| T* p = pBuf0; |
| T* q = pBuf0 + num_vals; |
| for (; p != q; p++) |
| { |
| const uint32_t key = BASISU_GET_KEY(p); |
| |
| hist[key & 0xFF]++; |
| hist[256 + ((key >> 8) & 0xFF)]++; |
| hist[512 + ((key >> 16) & 0xFF)]++; |
| hist[768 + ((key >> 24) & 0xFF)]++; |
| } |
| } |
| else if (key_size == 3) |
| { |
| T* p = pBuf0; |
| T* q = pBuf0 + num_vals; |
| for (; p != q; p++) |
| { |
| const uint32_t key = BASISU_GET_KEY(p); |
| |
| hist[key & 0xFF]++; |
| hist[256 + ((key >> 8) & 0xFF)]++; |
| hist[512 + ((key >> 16) & 0xFF)]++; |
| } |
| } |
| else if (key_size == 2) |
| { |
| T* p = pBuf0; |
| T* q = pBuf0 + (num_vals >> 1) * 2; |
| |
| for (; p != q; p += 2) |
| { |
| const uint32_t key0 = BASISU_GET_KEY(p); |
| const uint32_t key1 = BASISU_GET_KEY(p + 1); |
| |
| hist[key0 & 0xFF]++; |
| hist[256 + ((key0 >> 8) & 0xFF)]++; |
| |
| hist[key1 & 0xFF]++; |
| hist[256 + ((key1 >> 8) & 0xFF)]++; |
| } |
| |
| if (num_vals & 1) |
| { |
| const uint32_t key = BASISU_GET_KEY(p); |
| |
| hist[key & 0xFF]++; |
| hist[256 + ((key >> 8) & 0xFF)]++; |
| } |
| } |
| else |
| { |
| assert(key_size == 1); |
| if (key_size != 1) |
| return NULL; |
| |
| T* p = pBuf0; |
| T* q = pBuf0 + (num_vals >> 1) * 2; |
| |
| for (; p != q; p += 2) |
| { |
| const uint32_t key0 = BASISU_GET_KEY(p); |
| const uint32_t key1 = BASISU_GET_KEY(p + 1); |
| |
| hist[key0 & 0xFF]++; |
| hist[key1 & 0xFF]++; |
| } |
| |
| if (num_vals & 1) |
| { |
| const uint32_t key = BASISU_GET_KEY(p); |
| hist[key & 0xFF]++; |
| } |
| } |
| |
| T* pCur = pBuf0; |
| T* pNew = pBuf1; |
| |
| for (uint32_t pass = 0; pass < key_size; pass++) |
| { |
| const uint32_t* pHist = &hist[pass << 8]; |
| |
| uint32_t offsets[256]; |
| |
| uint32_t cur_ofs = 0; |
| for (uint32_t i = 0; i < 256; i += 2) |
| { |
| offsets[i] = cur_ofs; |
| cur_ofs += pHist[i]; |
| |
| offsets[i + 1] = cur_ofs; |
| cur_ofs += pHist[i + 1]; |
| } |
| |
| const uint32_t pass_shift = pass << 3; |
| |
| T* p = pCur; |
| T* q = pCur + (num_vals >> 1) * 2; |
| |
| for (; p != q; p += 2) |
| { |
| uint32_t c0 = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF; |
| uint32_t c1 = (BASISU_GET_KEY(p + 1) >> pass_shift) & 0xFF; |
| |
| if (c0 == c1) |
| { |
| uint32_t dst_offset0 = offsets[c0]; |
| |
| offsets[c0] = dst_offset0 + 2; |
| |
| pNew[dst_offset0] = p[0]; |
| pNew[dst_offset0 + 1] = p[1]; |
| } |
| else |
| { |
| uint32_t dst_offset0 = offsets[c0]++; |
| uint32_t dst_offset1 = offsets[c1]++; |
| |
| pNew[dst_offset0] = p[0]; |
| pNew[dst_offset1] = p[1]; |
| } |
| } |
| |
| if (num_vals & 1) |
| { |
| uint32_t c = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF; |
| |
| uint32_t dst_offset = offsets[c]; |
| offsets[c] = dst_offset + 1; |
| |
| pNew[dst_offset] = *p; |
| } |
| |
| T* t = pCur; |
| pCur = pNew; |
| pNew = t; |
| } |
| |
| return pCur; |
| } |
| |
| #undef BASISU_GET_KEY |
| |
| // Very simple job pool with no dependencies. |
| class job_pool |
| { |
| BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool); |
| |
| public: |
| // num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc. |
| job_pool(uint32_t num_threads); |
| ~job_pool(); |
| |
| void add_job(const std::function<void()>& job); |
| void add_job(std::function<void()>&& job); |
| |
| void wait_for_all(); |
| |
| size_t get_total_threads() const { return 1 + m_threads.size(); } |
| |
| private: |
| std::vector<std::thread> m_threads; |
| std::vector<std::function<void()> > m_queue; |
| |
| std::mutex m_mutex; |
| std::condition_variable m_has_work; |
| std::condition_variable m_no_more_jobs; |
| |
| uint32_t m_num_active_jobs; |
| |
| std::atomic<bool> m_kill_flag; |
| |
| void job_thread(uint32_t index); |
| }; |
| |
| // Simple 32-bit color class |
| |
| class color_rgba_i16 |
| { |
| public: |
| union |
| { |
| int16_t m_comps[4]; |
| |
| struct |
| { |
| int16_t r; |
| int16_t g; |
| int16_t b; |
| int16_t a; |
| }; |
| }; |
| |
| inline color_rgba_i16() |
| { |
| static_assert(sizeof(*this) == sizeof(int16_t)*4, "sizeof(*this) == sizeof(int16_t)*4"); |
| } |
| |
| inline color_rgba_i16(int sr, int sg, int sb, int sa) |
| { |
| set(sr, sg, sb, sa); |
| } |
| |
| inline color_rgba_i16 &set(int sr, int sg, int sb, int sa) |
| { |
| m_comps[0] = (int16_t)clamp<int>(sr, INT16_MIN, INT16_MAX); |
| m_comps[1] = (int16_t)clamp<int>(sg, INT16_MIN, INT16_MAX); |
| m_comps[2] = (int16_t)clamp<int>(sb, INT16_MIN, INT16_MAX); |
| m_comps[3] = (int16_t)clamp<int>(sa, INT16_MIN, INT16_MAX); |
| return *this; |
| } |
| }; |
| |
| class color_rgba |
| { |
| public: |
| union |
| { |
| uint8_t m_comps[4]; |
| |
| struct |
| { |
| uint8_t r; |
| uint8_t g; |
| uint8_t b; |
| uint8_t a; |
| }; |
| }; |
| |
| inline color_rgba() |
| { |
| static_assert(sizeof(*this) == 4, "sizeof(*this) != 4"); |
| static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)"); |
| } |
| |
| // Not too hot about this idea. |
| inline color_rgba(const basist::color32& other) : |
| r(other.r), |
| g(other.g), |
| b(other.b), |
| a(other.a) |
| { |
| } |
| |
| color_rgba& operator= (const basist::color32& rhs) |
| { |
| r = rhs.r; |
| g = rhs.g; |
| b = rhs.b; |
| a = rhs.a; |
| return *this; |
| } |
| |
| inline color_rgba(int y) |
| { |
| set(y); |
| } |
| |
| inline color_rgba(int y, int na) |
| { |
| set(y, na); |
| } |
| |
| inline color_rgba(int sr, int sg, int sb, int sa) |
| { |
| set(sr, sg, sb, sa); |
| } |
| |
| inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa) |
| { |
| set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa); |
| } |
| |
| inline color_rgba& set_noclamp_y(int y) |
| { |
| m_comps[0] = (uint8_t)y; |
| m_comps[1] = (uint8_t)y; |
| m_comps[2] = (uint8_t)y; |
| m_comps[3] = (uint8_t)255; |
| return *this; |
| } |
| |
| inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa) |
| { |
| m_comps[0] = (uint8_t)sr; |
| m_comps[1] = (uint8_t)sg; |
| m_comps[2] = (uint8_t)sb; |
| m_comps[3] = (uint8_t)sa; |
| return *this; |
| } |
| |
| inline color_rgba &set(int y) |
| { |
| m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255)); |
| m_comps[1] = m_comps[0]; |
| m_comps[2] = m_comps[0]; |
| m_comps[3] = 255; |
| return *this; |
| } |
| |
| inline color_rgba &set(int y, int na) |
| { |
| m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255)); |
| m_comps[1] = m_comps[0]; |
| m_comps[2] = m_comps[0]; |
| m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255)); |
| return *this; |
| } |
| |
| inline color_rgba &set(int sr, int sg, int sb, int sa) |
| { |
| m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255)); |
| m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255)); |
| m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255)); |
| m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255)); |
| return *this; |
| } |
| |
| inline color_rgba &set_rgb(int sr, int sg, int sb) |
| { |
| m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255)); |
| m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255)); |
| m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255)); |
| return *this; |
| } |
| |
| inline color_rgba &set_rgb(const color_rgba &other) |
| { |
| r = other.r; |
| g = other.g; |
| b = other.b; |
| return *this; |
| } |
| |
| inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; } |
| inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; } |
| |
| inline void clear() |
| { |
| m_comps[0] = 0; |
| m_comps[1] = 0; |
| m_comps[2] = 0; |
| m_comps[3] = 0; |
| } |
| |
| inline bool operator== (const color_rgba &rhs) const |
| { |
| if (m_comps[0] != rhs.m_comps[0]) return false; |
| if (m_comps[1] != rhs.m_comps[1]) return false; |
| if (m_comps[2] != rhs.m_comps[2]) return false; |
| if (m_comps[3] != rhs.m_comps[3]) return false; |
| return true; |
| } |
| |
| inline bool operator!= (const color_rgba &rhs) const |
| { |
| return !(*this == rhs); |
| } |
| |
| inline bool operator<(const color_rgba &rhs) const |
| { |
| for (int i = 0; i < 4; i++) |
| { |
| if (m_comps[i] < rhs.m_comps[i]) |
| return true; |
| else if (m_comps[i] != rhs.m_comps[i]) |
| return false; |
| } |
| return false; |
| } |
| |
| inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; } |
| inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; } |
| inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); } |
| |
| inline basist::color32 get_color32() const |
| { |
| return basist::color32(r, g, b, a); |
| } |
| |
| static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); } |
| static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); } |
| }; |
| |
| typedef basisu::vector<color_rgba> color_rgba_vec; |
| |
| const color_rgba g_black_color(0, 0, 0, 255); |
| const color_rgba g_black_trans_color(0, 0, 0, 0); |
| const color_rgba g_white_color(255, 255, 255, 255); |
| |
| inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1) |
| { |
| int dr = r0 - r1, dg = g0 - g1, db = b0 - b1; |
| return dr * dr + dg * dg + db * db; |
| } |
| |
| inline int color_distance(int r0, int g0, int b0, int a0, int r1, int g1, int b1, int a1) |
| { |
| int dr = r0 - r1, dg = g0 - g1, db = b0 - b1, da = a0 - a1; |
| return dr * dr + dg * dg + db * db + da * da; |
| } |
| |
| inline int color_distance(const color_rgba &c0, const color_rgba &c1, bool alpha) |
| { |
| if (alpha) |
| return color_distance(c0.r, c0.g, c0.b, c0.a, c1.r, c1.g, c1.b, c1.a); |
| else |
| return color_distance(c0.r, c0.g, c0.b, c1.r, c1.g, c1.b); |
| } |
| |
| // TODO: Allow user to control channel weightings. |
| inline uint32_t color_distance(bool perceptual, const color_rgba &e1, const color_rgba &e2, bool alpha) |
| { |
| if (perceptual) |
| { |
| #if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE |
| const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f; |
| const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f; |
| |
| const float cr1 = e1.r - l1; |
| const float cr2 = e2.r - l2; |
| |
| const float cb1 = e1.b - l1; |
| const float cb2 = e2.b - l2; |
| |
| const float dl = l1 - l2; |
| const float dcr = cr1 - cr2; |
| const float dcb = cb1 - cb2; |
| |
| uint32_t d = static_cast<uint32_t>(32.0f*4.0f*dl*dl + 32.0f*2.0f*(.5f / (1.0f - .2126f))*(.5f / (1.0f - .2126f))*dcr*dcr + 32.0f*.25f*(.5f / (1.0f - .0722f))*(.5f / (1.0f - .0722f))*dcb*dcb); |
| |
| if (alpha) |
| { |
| int da = static_cast<int>(e1.a) - static_cast<int>(e2.a); |
| d += static_cast<uint32_t>(128.0f*da*da); |
| } |
| |
| return d; |
| #elif 1 |
| int dr = e1.r - e2.r; |
| int dg = e1.g - e2.g; |
| int db = e1.b - e2.b; |
| |
| int delta_l = dr * 27 + dg * 92 + db * 9; |
| int delta_cr = dr * 128 - delta_l; |
| int delta_cb = db * 128 - delta_l; |
| |
| uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) + |
| ((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) + |
| ((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U); |
| |
| if (alpha) |
| { |
| int da = (e1.a - e2.a) << 7; |
| id += ((uint32_t)(da * da) >> 7U); |
| } |
| |
| return id; |
| #else |
| int dr = e1.r - e2.r; |
| int dg = e1.g - e2.g; |
| int db = e1.b - e2.b; |
| |
| int64_t delta_l = dr * 27 + dg * 92 + db * 9; |
| int64_t delta_cr = dr * 128 - delta_l; |
| int64_t delta_cb = db * 128 - delta_l; |
| |
| int64_t id = ((delta_l * delta_l) * 128) + |
| ((delta_cr * delta_cr) * 26) + |
| ((delta_cb * delta_cb) * 3); |
| |
| if (alpha) |
| { |
| int64_t da = (e1.a - e2.a); |
| id += (da * da) * 128; |
| } |
| |
| int d = (id + 8192) >> 14; |
| |
| return d; |
| #endif |
| } |
| else |
| return color_distance(e1, e2, alpha); |
| } |
| |
| static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b) |
| { |
| const int dl = a.r - b.r; |
| const int da = a.a - b.a; |
| return dl * dl + da * da; |
| } |
| |
| // String helpers |
| |
| inline int string_find_right(const std::string& filename, char c) |
| { |
| size_t result = filename.find_last_of(c); |
| return (result == std::string::npos) ? -1 : (int)result; |
| } |
| |
| inline std::string string_get_extension(const std::string &filename) |
| { |
| int sep = -1; |
| #ifdef _WIN32 |
| sep = string_find_right(filename, '\\'); |
| #endif |
| if (sep < 0) |
| sep = string_find_right(filename, '/'); |
| |
| int dot = string_find_right(filename, '.'); |
| if (dot <= sep) |
| return ""; |
| |
| std::string result(filename); |
| result.erase(0, dot + 1); |
| |
| return result; |
| } |
| |
| inline bool string_remove_extension(std::string &filename) |
| { |
| int sep = -1; |
| #ifdef _WIN32 |
| sep = string_find_right(filename, '\\'); |
| #endif |
| if (sep < 0) |
| sep = string_find_right(filename, '/'); |
| |
| int dot = string_find_right(filename, '.'); |
| if ((dot < sep) || (dot < 0)) |
| return false; |
| |
| filename.resize(dot); |
| |
| return true; |
| } |
| |
| inline std::string string_format(const char* pFmt, ...) |
| { |
| char buf[2048]; |
| |
| va_list args; |
| va_start(args, pFmt); |
| #ifdef _WIN32 |
| vsprintf_s(buf, sizeof(buf), pFmt, args); |
| #else |
| vsnprintf(buf, sizeof(buf), pFmt, args); |
| #endif |
| va_end(args); |
| |
| return std::string(buf); |
| } |
| |
| inline std::string string_tolower(const std::string& s) |
| { |
| std::string result(s); |
| for (size_t i = 0; i < result.size(); i++) |
| result[i] = (char)tolower((int)result[i]); |
| return result; |
| } |
| |
| inline char *strcpy_safe(char *pDst, size_t dst_len, const char *pSrc) |
| { |
| assert(pDst && pSrc && dst_len); |
| if (!dst_len) |
| return pDst; |
| |
| const size_t src_len = strlen(pSrc); |
| const size_t src_len_plus_terminator = src_len + 1; |
| |
| if (src_len_plus_terminator <= dst_len) |
| memcpy(pDst, pSrc, src_len_plus_terminator); |
| else |
| { |
| if (dst_len > 1) |
| memcpy(pDst, pSrc, dst_len - 1); |
| pDst[dst_len - 1] = '\0'; |
| } |
| |
| return pDst; |
| } |
| |
| inline bool string_ends_with(const std::string& s, char c) |
| { |
| return (s.size() != 0) && (s.back() == c); |
| } |
| |
| inline bool string_split_path(const char *p, std::string *pDrive, std::string *pDir, std::string *pFilename, std::string *pExt) |
| { |
| #ifdef _MSC_VER |
| char drive_buf[_MAX_DRIVE] = { 0 }; |
| char dir_buf[_MAX_DIR] = { 0 }; |
| char fname_buf[_MAX_FNAME] = { 0 }; |
| char ext_buf[_MAX_EXT] = { 0 }; |
| |
| errno_t error = _splitpath_s(p, |
| pDrive ? drive_buf : NULL, pDrive ? _MAX_DRIVE : 0, |
| pDir ? dir_buf : NULL, pDir ? _MAX_DIR : 0, |
| pFilename ? fname_buf : NULL, pFilename ? _MAX_FNAME : 0, |
| pExt ? ext_buf : NULL, pExt ? _MAX_EXT : 0); |
| if (error != 0) |
| return false; |
| |
| if (pDrive) *pDrive = drive_buf; |
| if (pDir) *pDir = dir_buf; |
| if (pFilename) *pFilename = fname_buf; |
| if (pExt) *pExt = ext_buf; |
| return true; |
| #else |
| char dirtmp[1024], nametmp[1024]; |
| strcpy_safe(dirtmp, sizeof(dirtmp), p); |
| strcpy_safe(nametmp, sizeof(nametmp), p); |
| |
| if (pDrive) |
| pDrive->resize(0); |
| |
| const char *pDirName = dirname(dirtmp); |
| const char* pBaseName = basename(nametmp); |
| if ((!pDirName) || (!pBaseName)) |
| return false; |
| |
| if (pDir) |
| { |
| *pDir = pDirName; |
| if ((pDir->size()) && (pDir->back() != '/')) |
| *pDir += "/"; |
| } |
| |
| if (pFilename) |
| { |
| *pFilename = pBaseName; |
| string_remove_extension(*pFilename); |
| } |
| |
| if (pExt) |
| { |
| *pExt = pBaseName; |
| *pExt = string_get_extension(*pExt); |
| if (pExt->size()) |
| *pExt = "." + *pExt; |
| } |
| |
| return true; |
| #endif |
| } |
| |
| inline bool is_path_separator(char c) |
| { |
| #ifdef _WIN32 |
| return (c == '/') || (c == '\\'); |
| #else |
| return (c == '/'); |
| #endif |
| } |
| |
| inline bool is_drive_separator(char c) |
| { |
| #ifdef _WIN32 |
| return (c == ':'); |
| #else |
| (void)c; |
| return false; |
| #endif |
| } |
| |
| inline void string_combine_path(std::string &dst, const char *p, const char *q) |
| { |
| std::string temp(p); |
| if (temp.size() && !is_path_separator(q[0])) |
| { |
| if (!is_path_separator(temp.back())) |
| temp.append(1, BASISU_PATH_SEPERATOR_CHAR); |
| } |
| temp += q; |
| dst.swap(temp); |
| } |
| |
| inline void string_combine_path(std::string &dst, const char *p, const char *q, const char *r) |
| { |
| string_combine_path(dst, p, q); |
| string_combine_path(dst, dst.c_str(), r); |
| } |
| |
| inline void string_combine_path_and_extension(std::string &dst, const char *p, const char *q, const char *r, const char *pExt) |
| { |
| string_combine_path(dst, p, q, r); |
| if ((!string_ends_with(dst, '.')) && (pExt[0]) && (pExt[0] != '.')) |
| dst.append(1, '.'); |
| dst.append(pExt); |
| } |
| |
| inline bool string_get_pathname(const char *p, std::string &path) |
| { |
| std::string temp_drive, temp_path; |
| if (!string_split_path(p, &temp_drive, &temp_path, NULL, NULL)) |
| return false; |
| string_combine_path(path, temp_drive.c_str(), temp_path.c_str()); |
| return true; |
| } |
| |
| inline bool string_get_filename(const char *p, std::string &filename) |
| { |
| std::string temp_ext; |
| if (!string_split_path(p, nullptr, nullptr, &filename, &temp_ext)) |
| return false; |
| filename += temp_ext; |
| return true; |
| } |
| |
| class rand |
| { |
| std::mt19937 m_mt; |
| |
| public: |
| rand() { } |
| |
| rand(uint32_t s) { seed(s); } |
| void seed(uint32_t s) { m_mt.seed(s); } |
| |
| // between [l,h] |
| int irand(int l, int h) { std::uniform_int_distribution<int> d(l, h); return d(m_mt); } |
| |
| uint32_t urand32() { return static_cast<uint32_t>(irand(INT32_MIN, INT32_MAX)); } |
| |
| bool bit() { return irand(0, 1) == 1; } |
| |
| uint8_t byte() { return static_cast<uint8_t>(urand32()); } |
| |
| // between [l,h) |
| float frand(float l, float h) { std::uniform_real_distribution<float> d(l, h); return d(m_mt); } |
| |
| float gaussian(float mean, float stddev) { std::normal_distribution<float> d(mean, stddev); return d(m_mt); } |
| }; |
| |
| class priority_queue |
| { |
| public: |
| priority_queue() : |
| m_size(0) |
| { |
| } |
| |
| void clear() |
| { |
| m_heap.clear(); |
| m_size = 0; |
| } |
| |
| void init(uint32_t max_entries, uint32_t first_index, float first_priority) |
| { |
| m_heap.resize(max_entries + 1); |
| m_heap[1].m_index = first_index; |
| m_heap[1].m_priority = first_priority; |
| m_size = 1; |
| } |
| |
| inline uint32_t size() const { return m_size; } |
| |
| inline uint32_t get_top_index() const { return m_heap[1].m_index; } |
| inline float get_top_priority() const { return m_heap[1].m_priority; } |
| |
| inline void delete_top() |
| { |
| assert(m_size > 0); |
| m_heap[1] = m_heap[m_size]; |
| m_size--; |
| if (m_size) |
| down_heap(1); |
| } |
| |
| inline void add_heap(uint32_t index, float priority) |
| { |
| m_size++; |
| |
| uint32_t k = m_size; |
| |
| if (m_size >= m_heap.size()) |
| m_heap.resize(m_size + 1); |
| |
| for (;;) |
| { |
| uint32_t parent_index = k >> 1; |
| if ((!parent_index) || (m_heap[parent_index].m_priority > priority)) |
| break; |
| m_heap[k] = m_heap[parent_index]; |
| k = parent_index; |
| } |
| |
| m_heap[k].m_index = index; |
| m_heap[k].m_priority = priority; |
| } |
| |
| private: |
| struct entry |
| { |
| uint32_t m_index; |
| float m_priority; |
| }; |
| |
| basisu::vector<entry> m_heap; |
| uint32_t m_size; |
| |
| // Push down entry at index |
| inline void down_heap(uint32_t heap_index) |
| { |
| uint32_t orig_index = m_heap[heap_index].m_index; |
| const float orig_priority = m_heap[heap_index].m_priority; |
| |
| uint32_t child_index; |
| while ((child_index = (heap_index << 1)) <= m_size) |
| { |
| if ((child_index < m_size) && (m_heap[child_index].m_priority < m_heap[child_index + 1].m_priority)) ++child_index; |
| if (orig_priority > m_heap[child_index].m_priority) |
| break; |
| m_heap[heap_index] = m_heap[child_index]; |
| heap_index = child_index; |
| } |
| |
| m_heap[heap_index].m_index = orig_index; |
| m_heap[heap_index].m_priority = orig_priority; |
| } |
| }; |
| |
| // Tree structured vector quantization (TSVQ) |
| |
| template <typename TrainingVectorType> |
| class tree_vector_quant |
| { |
| public: |
| typedef TrainingVectorType training_vec_type; |
| typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight; |
| typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs; |
| |
| tree_vector_quant() : |
| m_next_codebook_index(0) |
| { |
| } |
| |
| void clear() |
| { |
| clear_vector(m_training_vecs); |
| clear_vector(m_nodes); |
| m_next_codebook_index = 0; |
| } |
| |
| void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); } |
| |
| size_t get_total_training_vecs() const { return m_training_vecs.size(); } |
| const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; } |
| array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; } |
| |
| void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const |
| { |
| for (uint32_t i = 0; i < m_nodes.size(); i++) |
| { |
| const tsvq_node &n = m_nodes[i]; |
| if (!n.is_leaf()) |
| continue; |
| |
| codebook.resize(codebook.size() + 1); |
| codebook.back() = n.m_training_vecs; |
| } |
| } |
| |
| void retrieve(basisu::vector<TrainingVectorType> &codebook) const |
| { |
| for (uint32_t i = 0; i < m_nodes.size(); i++) |
| { |
| const tsvq_node &n = m_nodes[i]; |
| if (!n.is_leaf()) |
| continue; |
| |
| codebook.resize(codebook.size() + 1); |
| codebook.back() = n.m_origin; |
| } |
| } |
| |
| void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const |
| { |
| uint_vec node_stack; |
| node_stack.reserve(512); |
| |
| codebook.resize(0); |
| codebook.reserve(max_clusters); |
| |
| uint32_t node_index = 0; |
| |
| while (true) |
| { |
| const tsvq_node& cur = m_nodes[node_index]; |
| |
| if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters)) |
| { |
| codebook.resize(codebook.size() + 1); |
| codebook.back() = cur.m_training_vecs; |
| |
| if (node_stack.empty()) |
| break; |
| |
| node_index = node_stack.back(); |
| node_stack.pop_back(); |
| continue; |
| } |
| |
| node_stack.push_back(cur.m_right_index); |
| node_index = cur.m_left_index; |
| } |
| } |
| |
| bool generate(uint32_t max_size) |
| { |
| if (!m_training_vecs.size()) |
| return false; |
| |
| m_next_codebook_index = 0; |
| |
| clear_vector(m_nodes); |
| m_nodes.reserve(max_size * 2 + 1); |
| |
| m_nodes.push_back(prepare_root()); |
| |
| priority_queue var_heap; |
| var_heap.init(max_size, 0, m_nodes[0].m_var); |
| |
| basisu::vector<uint32_t> l_children, r_children; |
| |
| // Now split the worst nodes |
| l_children.reserve(m_training_vecs.size() + 1); |
| r_children.reserve(m_training_vecs.size() + 1); |
| |
| uint32_t total_leaf_nodes = 1; |
| |
| //interval_timer tm; |
| //tm.start(); |
| |
| while ((var_heap.size()) && (total_leaf_nodes < max_size)) |
| { |
| const uint32_t node_index = var_heap.get_top_index(); |
| const tsvq_node &node = m_nodes[node_index]; |
| |
| assert(node.m_var == var_heap.get_top_priority()); |
| assert(node.is_leaf()); |
| |
| var_heap.delete_top(); |
| |
| if (node.m_training_vecs.size() > 1) |
| { |
| if (split_node(node_index, var_heap, l_children, r_children)) |
| { |
| // This removes one leaf node (making an internal node) and replaces it with two new leaves, so +1 total. |
| total_leaf_nodes += 1; |
| } |
| } |
| } |
| |
| //debug_printf("tree_vector_quant::generate %u: %3.3f secs\n", TrainingVectorType::num_elements, tm.get_elapsed_secs()); |
| |
| return true; |
| } |
| |
| private: |
| class tsvq_node |
| { |
| public: |
| inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { } |
| |
| // vecs is erased |
| inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); } |
| |
| inline bool is_leaf() const { return m_left_index < 0; } |
| |
| float m_var; |
| uint64_t m_weight; |
| TrainingVectorType m_origin; |
| int32_t m_left_index, m_right_index; |
| basisu::vector<uint32_t> m_training_vecs; |
| int m_codebook_index; |
| }; |
| |
| typedef basisu::vector<tsvq_node> tsvq_node_vec; |
| tsvq_node_vec m_nodes; |
| |
| array_of_weighted_training_vecs m_training_vecs; |
| |
| uint32_t m_next_codebook_index; |
| |
| tsvq_node prepare_root() const |
| { |
| double ttsum = 0.0f; |
| |
| // Prepare root node containing all training vectors |
| tsvq_node root; |
| root.m_training_vecs.reserve(m_training_vecs.size()); |
| |
| for (uint32_t i = 0; i < m_training_vecs.size(); i++) |
| { |
| const TrainingVectorType &v = m_training_vecs[i].first; |
| const uint64_t weight = m_training_vecs[i].second; |
| |
| root.m_training_vecs.push_back(i); |
| |
| root.m_origin += (v * static_cast<float>(weight)); |
| root.m_weight += weight; |
| |
| ttsum += v.dot(v) * weight; |
| } |
| |
| root.m_var = static_cast<float>(ttsum - (root.m_origin.dot(root.m_origin) / root.m_weight)); |
| |
| root.m_origin *= (1.0f / root.m_weight); |
| |
| return root; |
| } |
| |
| bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children) |
| { |
| TrainingVectorType l_child_org, r_child_org; |
| uint64_t l_weight = 0, r_weight = 0; |
| float l_var = 0.0f, r_var = 0.0f; |
| |
| // Compute initial left/right child origins |
| if (!prep_split(m_nodes[node_index], l_child_org, r_child_org)) |
| return false; |
| |
| // Use k-means iterations to refine these children vectors |
| if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children)) |
| return false; |
| |
| // Create children |
| const uint32_t l_child_index = (uint32_t)m_nodes.size(), r_child_index = (uint32_t)m_nodes.size() + 1; |
| |
| m_nodes[node_index].m_left_index = l_child_index; |
| m_nodes[node_index].m_right_index = r_child_index; |
| |
| m_nodes[node_index].m_codebook_index = m_next_codebook_index; |
| m_next_codebook_index++; |
| |
| m_nodes.resize(m_nodes.size() + 2); |
| |
| tsvq_node &l_child = m_nodes[l_child_index], &r_child = m_nodes[r_child_index]; |
| |
| l_child.set(l_child_org, l_weight, l_var, l_children); |
| r_child.set(r_child_org, r_weight, r_var, r_children); |
| |
| if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1)) |
| { |
| TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first); |
| |
| for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++) |
| { |
| if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first)) |
| { |
| l_child.m_var = 1e-4f; |
| break; |
| } |
| } |
| } |
| |
| if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1)) |
| { |
| TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first); |
| |
| for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++) |
| { |
| if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first)) |
| { |
| r_child.m_var = 1e-4f; |
| break; |
| } |
| } |
| } |
| |
| if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1)) |
| var_heap.add_heap(l_child_index, l_child.m_var); |
| |
| if ((r_child.m_var > 0.0f) && (r_child.m_training_vecs.size() > 1)) |
| var_heap.add_heap(r_child_index, r_child.m_var); |
| |
| return true; |
| } |
| |
| TrainingVectorType compute_split_axis(const tsvq_node &node) const |
| { |
| const uint32_t N = TrainingVectorType::num_elements; |
| |
| matrix<N, N, float> cmatrix; |
| |
| if ((N != 16) || (!g_cpu_supports_sse41)) |
| { |
| cmatrix.set_zero(); |
| |
| // Compute covariance matrix from weighted input vectors |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| { |
| const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin); |
| const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v); |
| |
| for (uint32_t x = 0; x < N; x++) |
| for (uint32_t y = x; y < N; y++) |
| cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y]; |
| } |
| } |
| else |
| { |
| #if BASISU_SUPPORT_SSE |
| // Specialize the case with 16x16 matrices, which are quite expensive without SIMD. |
| // This SSE function takes pointers to void types, so do some sanity checks. |
| assert(sizeof(TrainingVectorType) == sizeof(float) * 16); |
| assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>)); |
| update_covar_matrix_16x16_sse41(node.m_training_vecs.size(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix); |
| #endif |
| } |
| |
| const float renorm_scale = 1.0f / node.m_weight; |
| |
| for (uint32_t x = 0; x < N; x++) |
| for (uint32_t y = x; y < N; y++) |
| cmatrix[x][y] *= renorm_scale; |
| |
| // Diagonal flip |
| for (uint32_t x = 0; x < (N - 1); x++) |
| for (uint32_t y = x + 1; y < N; y++) |
| cmatrix[y][x] = cmatrix[x][y]; |
| |
| return compute_pca_from_covar<N, TrainingVectorType>(cmatrix); |
| } |
| |
| bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const |
| { |
| //const uint32_t N = TrainingVectorType::num_elements; |
| |
| if (2 == node.m_training_vecs.size()) |
| { |
| l_child_result = m_training_vecs[node.m_training_vecs[0]].first; |
| r_child_result = m_training_vecs[node.m_training_vecs[1]].first; |
| return true; |
| } |
| |
| TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f); |
| double l_weight = 0.0f, r_weight = 0.0f; |
| |
| // Compute initial left/right children |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| { |
| const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second; |
| |
| const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first; |
| |
| double t = (v - node.m_origin).dot(axis); |
| if (t >= 0.0f) |
| { |
| r_child += v * weight; |
| r_weight += weight; |
| } |
| else |
| { |
| l_child += v * weight; |
| l_weight += weight; |
| } |
| } |
| |
| if ((l_weight > 0.0f) && (r_weight > 0.0f)) |
| { |
| l_child_result = l_child * static_cast<float>(1.0f / l_weight); |
| r_child_result = r_child * static_cast<float>(1.0f / r_weight); |
| } |
| else |
| { |
| TrainingVectorType l(1e+20f); |
| TrainingVectorType h(-1e+20f); |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| { |
| const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first; |
| |
| l = TrainingVectorType::component_min(l, v); |
| h = TrainingVectorType::component_max(h, v); |
| } |
| |
| TrainingVectorType r(h - l); |
| |
| float largest_axis_v = 0.0f; |
| int largest_axis_index = -1; |
| for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++) |
| { |
| if (r[i] > largest_axis_v) |
| { |
| largest_axis_v = r[i]; |
| largest_axis_index = i; |
| } |
| } |
| |
| if (largest_axis_index < 0) |
| return false; |
| |
| basisu::vector<float> keys(node.m_training_vecs.size()); |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index]; |
| |
| uint_vec indices(node.m_training_vecs.size()); |
| indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]); |
| |
| l_child.set_zero(); |
| l_weight = 0; |
| |
| r_child.set_zero(); |
| r_weight = 0; |
| |
| const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2; |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| { |
| const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second; |
| |
| const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first; |
| |
| if (i < half_index) |
| { |
| l_child += v * weight; |
| l_weight += weight; |
| } |
| else |
| { |
| r_child += v * weight; |
| r_weight += weight; |
| } |
| } |
| |
| if ((l_weight > 0.0f) && (r_weight > 0.0f)) |
| { |
| l_child_result = l_child * static_cast<float>(1.0f / l_weight); |
| r_child_result = r_child * static_cast<float>(1.0f / r_weight); |
| } |
| else |
| { |
| l_child_result = l; |
| r_child_result = h; |
| } |
| } |
| |
| return true; |
| } |
| |
| bool refine_split(const tsvq_node &node, |
| TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children, |
| TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const |
| { |
| l_children.reserve(node.m_training_vecs.size()); |
| r_children.reserve(node.m_training_vecs.size()); |
| |
| float prev_total_variance = 1e+10f; |
| |
| // Refine left/right children locations using k-means iterations |
| const uint32_t cMaxIters = 6; |
| for (uint32_t iter = 0; iter < cMaxIters; iter++) |
| { |
| l_children.resize(0); |
| r_children.resize(0); |
| |
| TrainingVectorType new_l_child(cZero), new_r_child(cZero); |
| |
| double l_ttsum = 0.0f, r_ttsum = 0.0f; |
| |
| l_weight = 0; |
| r_weight = 0; |
| |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| { |
| const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first; |
| const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second; |
| |
| double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v); |
| |
| if (left_dist2 >= right_dist2) |
| { |
| new_r_child += (v * static_cast<float>(weight)); |
| r_weight += weight; |
| |
| r_ttsum += weight * v.dot(v); |
| r_children.push_back(node.m_training_vecs[i]); |
| } |
| else |
| { |
| new_l_child += (v * static_cast<float>(weight)); |
| l_weight += weight; |
| |
| l_ttsum += weight * v.dot(v); |
| l_children.push_back(node.m_training_vecs[i]); |
| } |
| } |
| |
| // Node is unsplittable using the above algorithm - try something else to split it up. |
| if ((!l_weight) || (!r_weight)) |
| { |
| l_children.resize(0); |
| new_l_child.set(0.0f); |
| l_ttsum = 0.0f; |
| l_weight = 0; |
| |
| r_children.resize(0); |
| new_r_child.set(0.0f); |
| r_ttsum = 0.0f; |
| r_weight = 0; |
| |
| TrainingVectorType firstVec; |
| for (uint32_t i = 0; i < node.m_training_vecs.size(); i++) |
| { |
| const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first; |
| const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second; |
| |
| if ((!i) || (v == firstVec)) |
| { |
| firstVec = v; |
| |
| new_r_child += (v * static_cast<float>(weight)); |
| r_weight += weight; |
| |
| r_ttsum += weight * v.dot(v); |
| r_children.push_back(node.m_training_vecs[i]); |
| } |
| else |
| { |
| new_l_child += (v * static_cast<float>(weight)); |
| l_weight += weight; |
| |
| l_ttsum += weight * v.dot(v); |
| l_children.push_back(node.m_training_vecs[i]); |
| } |
| } |
| |
| if ((!l_weight) || (!r_weight)) |
| return false; |
| } |
| |
| l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight)); |
| r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight)); |
| |
| new_l_child *= (1.0f / l_weight); |
| new_r_child *= (1.0f / r_weight); |
| |
| l_child = new_l_child; |
| r_child = new_r_child; |
| |
| float total_var = l_var + r_var; |
| const float cGiveupVariance = .00001f; |
| if (total_var < cGiveupVariance) |
| break; |
| |
| // Check to see if the variance has settled |
| const float cVarianceDeltaThresh = .00125f; |
| if (((prev_total_variance - total_var) / total_var) < cVarianceDeltaThresh) |
| break; |
| |
| prev_total_variance = total_var; |
| } |
| |
| return true; |
| } |
| }; |
| |
| struct weighted_block_group |
| { |
| uint64_t m_total_weight; |
| uint_vec m_indices; |
| }; |
| |
| template<typename Quantizer> |
| bool generate_hierarchical_codebook_threaded_internal(Quantizer& q, |
| uint32_t max_codebook_size, uint32_t max_parent_codebook_size, |
| basisu::vector<uint_vec>& codebook, |
| basisu::vector<uint_vec>& parent_codebook, |
| uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool) |
| { |
| codebook.resize(0); |
| parent_codebook.resize(0); |
| |
| if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16)) |
| { |
| if (!q.generate(max_codebook_size)) |
| return false; |
| |
| q.retrieve(codebook); |
| |
| if (max_parent_codebook_size) |
| q.retrieve(max_parent_codebook_size, parent_codebook); |
| |
| return true; |
| } |
| |
| const uint32_t cMaxThreads = 16; |
| if (max_threads > cMaxThreads) |
| max_threads = cMaxThreads; |
| |
| if (!q.generate(max_threads)) |
| return false; |
| |
| basisu::vector<uint_vec> initial_codebook; |
| |
| q.retrieve(initial_codebook); |
| |
| if (initial_codebook.size() < max_threads) |
| { |
| codebook = initial_codebook; |
| |
| if (max_parent_codebook_size) |
| q.retrieve(max_parent_codebook_size, parent_codebook); |
| |
| return true; |
| } |
| |
| Quantizer quantizers[cMaxThreads]; |
| |
| bool success_flags[cMaxThreads]; |
| clear_obj(success_flags); |
| |
| basisu::vector<uint_vec> local_clusters[cMaxThreads]; |
| basisu::vector<uint_vec> local_parent_clusters[cMaxThreads]; |
| |
| for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++) |
| { |
| #ifndef __EMSCRIPTEN__ |
| pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] { |
| #endif |
| |
| Quantizer& lq = quantizers[thread_iter]; |
| uint_vec& cluster_indices = initial_codebook[thread_iter]; |
| |
| uint_vec local_to_global(cluster_indices.size()); |
| |
| for (uint32_t i = 0; i < cluster_indices.size(); i++) |
| { |
| const uint32_t global_training_vec_index = cluster_indices[i]; |
| local_to_global[i] = global_training_vec_index; |
| |
| lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second); |
| } |
| |
| const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs(); |
| |
| success_flags[thread_iter] = lq.generate(max_clusters); |
| |
| if (success_flags[thread_iter]) |
| { |
| lq.retrieve(local_clusters[thread_iter]); |
| |
| for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++) |
| { |
| for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++) |
| local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]]; |
| } |
| |
| if (max_parent_codebook_size) |
| { |
| lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]); |
| |
| for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++) |
| { |
| for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++) |
| local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]]; |
| } |
| } |
| } |
| |
| #ifndef __EMSCRIPTEN__ |
| } ); |
| #endif |
| |
| } // thread_iter |
| |
| #ifndef __EMSCRIPTEN__ |
| pJob_pool->wait_for_all(); |
| #endif |
| |
| uint32_t total_clusters = 0, total_parent_clusters = 0; |
| |
| for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++) |
| { |
| if (!success_flags[thread_iter]) |
| return false; |
| total_clusters += (uint32_t)local_clusters[thread_iter].size(); |
| total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size(); |
| } |
| |
| codebook.reserve(total_clusters); |
| parent_codebook.reserve(total_parent_clusters); |
| |
| for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++) |
| { |
| for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++) |
| { |
| codebook.resize(codebook.size() + 1); |
| codebook.back().swap(local_clusters[thread_iter][j]); |
| } |
| |
| for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++) |
| { |
| parent_codebook.resize(parent_codebook.size() + 1); |
| parent_codebook.back().swap(local_parent_clusters[thread_iter][j]); |
| } |
| } |
| |
| return true; |
| } |
| |
| template<typename Quantizer> |
| bool generate_hierarchical_codebook_threaded(Quantizer& q, |
| uint32_t max_codebook_size, uint32_t max_parent_codebook_size, |
| basisu::vector<uint_vec>& codebook, |
| basisu::vector<uint_vec>& parent_codebook, |
| uint32_t max_threads, job_pool *pJob_pool, |
| bool even_odd_input_pairs_equal) |
| { |
| typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher; |
| |
| typedef std::unordered_map < typename Quantizer::training_vec_type, weighted_block_group, |
| training_vec_bit_hasher> group_hash; |
| |
| //interval_timer tm; |
| //tm.start(); |
| |
| group_hash unique_vecs; |
| |
| unique_vecs.reserve(20000); |
| |
| weighted_block_group g; |
| |
| if (even_odd_input_pairs_equal) |
| { |
| g.m_indices.resize(2); |
| |
| assert(q.get_training_vecs().size() >= 2 && (q.get_training_vecs().size() & 1) == 0); |
| |
| for (uint32_t i = 0; i < q.get_training_vecs().size(); i += 2) |
| { |
| assert(q.get_training_vecs()[i].first == q.get_training_vecs()[i + 1].first); |
| |
| g.m_total_weight = q.get_training_vecs()[i].second + q.get_training_vecs()[i + 1].second; |
| g.m_indices[0] = i; |
| g.m_indices[1] = i + 1; |
| |
| auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g)); |
| |
| if (!ins_res.second) |
| { |
| (ins_res.first)->second.m_total_weight += g.m_total_weight; |
| (ins_res.first)->second.m_indices.push_back(i); |
| (ins_res.first)->second.m_indices.push_back(i + 1); |
| } |
| } |
| } |
| else |
| { |
| g.m_indices.resize(1); |
| |
| for (uint32_t i = 0; i < q.get_training_vecs().size(); i++) |
| { |
| g.m_total_weight = q.get_training_vecs()[i].second; |
| g.m_indices[0] = i; |
| |
| auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g)); |
| |
| if (!ins_res.second) |
| { |
| (ins_res.first)->second.m_total_weight += g.m_total_weight; |
| (ins_res.first)->second.m_indices.push_back(i); |
| } |
| } |
| } |
| |
| //debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors, %3.3f secs\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size(), tm.get_elapsed_secs()); |
| debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size()); |
| |
| Quantizer group_quant; |
| typedef typename group_hash::const_iterator group_hash_const_iter; |
| basisu::vector<group_hash_const_iter> unique_vec_iters; |
| unique_vec_iters.reserve(unique_vecs.size()); |
| |
| for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter) |
| { |
| group_quant.add_training_vec(iter->first, iter->second.m_total_weight); |
| unique_vec_iters.push_back(iter); |
| } |
| |
| bool limit_clusterizers = true; |
| if (unique_vecs.size() <= max_codebook_size) |
| limit_clusterizers = false; |
| |
| debug_printf("Limit clusterizers: %u\n", limit_clusterizers); |
| |
| basisu::vector<uint_vec> group_codebook, group_parent_codebook; |
| bool status = generate_hierarchical_codebook_threaded_internal(group_quant, |
| max_codebook_size, max_parent_codebook_size, |
| group_codebook, |
| group_parent_codebook, |
| (unique_vecs.size() < 65536*4) ? 1 : max_threads, limit_clusterizers, pJob_pool); |
| |
| if (!status) |
| return false; |
| |
| codebook.resize(0); |
| for (uint32_t i = 0; i < group_codebook.size(); i++) |
| { |
| codebook.resize(codebook.size() + 1); |
| |
| for (uint32_t j = 0; j < group_codebook[i].size(); j++) |
| { |
| const uint32_t group_index = group_codebook[i][j]; |
| |
| typename group_hash::const_iterator group_iter = unique_vec_iters[group_index]; |
| const uint_vec& training_vec_indices = group_iter->second.m_indices; |
| |
| append_vector(codebook.back(), training_vec_indices); |
| } |
| } |
| |
| parent_codebook.resize(0); |
| for (uint32_t i = 0; i < group_parent_codebook.size(); i++) |
| { |
| parent_codebook.resize(parent_codebook.size() + 1); |
| |
| for (uint32_t j = 0; j < group_parent_codebook[i].size(); j++) |
| { |
| const uint32_t group_index = group_parent_codebook[i][j]; |
| |
| typename group_hash::const_iterator group_iter = unique_vec_iters[group_index]; |
| const uint_vec& training_vec_indices = group_iter->second.m_indices; |
| |
| append_vector(parent_codebook.back(), training_vec_indices); |
| } |
| } |
| |
| return true; |
| } |
| |
| // Canonical Huffman coding |
| |
| class histogram |
| { |
| basisu::vector<uint32_t> m_hist; |
| |
| public: |
| histogram(uint32_t size = 0) { init(size); } |
| |
| void clear() |
| { |
| clear_vector(m_hist); |
| } |
| |
| void init(uint32_t size) |
| { |
| m_hist.resize(0); |
| m_hist.resize(size); |
| } |
| |
| inline uint32_t size() const { return static_cast<uint32_t>(m_hist.size()); } |
| |
| inline const uint32_t &operator[] (uint32_t index) const |
| { |
| return m_hist[index]; |
| } |
| |
| inline uint32_t &operator[] (uint32_t index) |
| { |
| return m_hist[index]; |
| } |
| |
| inline void inc(uint32_t index) |
| { |
| m_hist[index]++; |
| } |
| |
| uint64_t get_total() const |
| { |
| uint64_t total = 0; |
| for (uint32_t i = 0; i < m_hist.size(); ++i) |
| total += m_hist[i]; |
| return total; |
| } |
| |
| double get_entropy() const |
| { |
| double total = static_cast<double>(get_total()); |
| if (total == 0.0f) |
| return 0.0f; |
| |
| const double inv_total = 1.0f / total; |
| const double neg_inv_log2 = -1.0f / log(2.0f); |
| |
| double e = 0.0f; |
| for (uint32_t i = 0; i < m_hist.size(); i++) |
| if (m_hist[i]) |
| e += log(m_hist[i] * inv_total) * neg_inv_log2 * static_cast<double>(m_hist[i]); |
| |
| return e; |
| } |
| }; |
| |
| struct sym_freq |
| { |
| uint32_t m_key; |
| uint16_t m_sym_index; |
| }; |
| |
| sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1); |
| void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms); |
| void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size); |
| |
| class huffman_encoding_table |
| { |
| public: |
| huffman_encoding_table() |
| { |
| } |
| |
| void clear() |
| { |
| clear_vector(m_codes); |
| clear_vector(m_code_sizes); |
| } |
| |
| bool init(const histogram &h, uint32_t max_code_size = cHuffmanMaxSupportedCodeSize) |
| { |
| return init(h.size(), &h[0], max_code_size); |
| } |
| |
| bool init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size); |
| bool init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size); |
| |
| inline const uint16_vec &get_codes() const { return m_codes; } |
| inline const uint8_vec &get_code_sizes() const { return m_code_sizes; } |
| |
| uint32_t get_total_used_codes() const |
| { |
| for (int i = static_cast<int>(m_code_sizes.size()) - 1; i >= 0; i--) |
| if (m_code_sizes[i]) |
| return i + 1; |
| return 0; |
| } |
| |
| private: |
| uint16_vec m_codes; |
| uint8_vec m_code_sizes; |
| }; |
| |
| class bitwise_coder |
| { |
| public: |
| bitwise_coder() : |
| m_bit_buffer(0), |
| m_bit_buffer_size(0), |
| m_total_bits(0) |
| { |
| } |
| |
| inline void clear() |
| { |
| clear_vector(m_bytes); |
| m_bit_buffer = 0; |
| m_bit_buffer_size = 0; |
| m_total_bits = 0; |
| } |
| |
| inline const uint8_vec &get_bytes() const { return m_bytes; } |
| |
| inline uint64_t get_total_bits() const { return m_total_bits; } |
| inline void clear_total_bits() { m_total_bits = 0; } |
| |
| inline void init(uint32_t reserve_size = 1024) |
| { |
| m_bytes.reserve(reserve_size); |
| m_bytes.resize(0); |
| |
| m_bit_buffer = 0; |
| m_bit_buffer_size = 0; |
| m_total_bits = 0; |
| } |
| |
| inline uint32_t flush() |
| { |
| if (m_bit_buffer_size) |
| { |
| m_total_bits += 8 - (m_bit_buffer_size & 7); |
| append_byte(static_cast<uint8_t>(m_bit_buffer)); |
| |
| m_bit_buffer = 0; |
| m_bit_buffer_size = 0; |
| |
| return 8; |
| } |
| |
| return 0; |
| } |
| |
| inline uint32_t put_bits(uint32_t bits, uint32_t num_bits) |
| { |
| assert(num_bits <= 32); |
| assert(bits < (1ULL << num_bits)); |
| |
| if (!num_bits) |
| return 0; |
| |
| m_total_bits += num_bits; |
| |
| uint64_t v = (static_cast<uint64_t>(bits) << m_bit_buffer_size) | m_bit_buffer; |
| m_bit_buffer_size += num_bits; |
| |
| while (m_bit_buffer_size >= 8) |
| { |
| append_byte(static_cast<uint8_t>(v)); |
| v >>= 8; |
| m_bit_buffer_size -= 8; |
| } |
| |
| m_bit_buffer = static_cast<uint8_t>(v); |
| return num_bits; |
| } |
| |
| inline uint32_t put_code(uint32_t sym, const huffman_encoding_table &tab) |
| { |
| uint32_t code = tab.get_codes()[sym]; |
| uint32_t code_size = tab.get_code_sizes()[sym]; |
| assert(code_size >= 1); |
| put_bits(code, code_size); |
| return code_size; |
| } |
| |
| inline uint32_t put_truncated_binary(uint32_t v, uint32_t n) |
| { |
| assert((n >= 2) && (v < n)); |
| |
| uint32_t k = floor_log2i(n); |
| uint32_t u = (1 << (k + 1)) - n; |
| |
| if (v < u) |
| return put_bits(v, k); |
| |
| uint32_t x = v + u; |
| assert((x >> 1) >= u); |
| |
| put_bits(x >> 1, k); |
| put_bits(x & 1, 1); |
| return k + 1; |
| } |
| |
| inline uint32_t put_rice(uint32_t v, uint32_t m) |
| { |
| assert(m); |
| |
| const uint64_t start_bits = m_total_bits; |
| |
| uint32_t q = v >> m, r = v & ((1 << m) - 1); |
| |
| // rice coding sanity check |
| assert(q <= 64); |
| |
| for (; q > 16; q -= 16) |
| put_bits(0xFFFF, 16); |
| |
| put_bits((1 << q) - 1, q); |
| put_bits(r << 1, m + 1); |
| |
| return (uint32_t)(m_total_bits - start_bits); |
| } |
| |
| inline uint32_t put_vlc(uint32_t v, uint32_t chunk_bits) |
| { |
| assert(chunk_bits); |
| |
| const uint32_t chunk_size = 1 << chunk_bits; |
| const uint32_t chunk_mask = chunk_size - 1; |
| |
| uint32_t total_bits = 0; |
| |
| for ( ; ; ) |
| { |
| uint32_t next_v = v >> chunk_bits; |
| |
| total_bits += put_bits((v & chunk_mask) | (next_v ? chunk_size : 0), chunk_bits + 1); |
| if (!next_v) |
| break; |
| |
| v = next_v; |
| } |
| |
| return total_bits; |
| } |
| |
| uint32_t emit_huffman_table(const huffman_encoding_table &tab); |
| |
| private: |
| uint8_vec m_bytes; |
| uint32_t m_bit_buffer, m_bit_buffer_size; |
| uint64_t m_total_bits; |
| |
| void append_byte(uint8_t c) |
| { |
| m_bytes.resize(m_bytes.size() + 1); |
| m_bytes.back() = c; |
| } |
| |
| static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len); |
| static void end_zero_run(uint16_vec &syms, uint32_t &run_size); |
| }; |
| |
| class huff2D |
| { |
| public: |
| huff2D() { } |
| huff2D(uint32_t bits_per_sym, uint32_t total_syms_per_group) { init(bits_per_sym, total_syms_per_group); } |
| |
| inline const histogram &get_histogram() const { return m_histogram; } |
| inline const huffman_encoding_table &get_encoding_table() const { return m_encoding_table; } |
| |
| inline void init(uint32_t bits_per_sym, uint32_t total_syms_per_group) |
| { |
| assert((bits_per_sym * total_syms_per_group) <= 16 && total_syms_per_group >= 1 && bits_per_sym >= 1); |
| |
| m_bits_per_sym = bits_per_sym; |
| m_total_syms_per_group = total_syms_per_group; |
| m_cur_sym_bits = 0; |
| m_cur_num_syms = 0; |
| m_decode_syms_remaining = 0; |
| m_next_decoder_group_index = 0; |
| |
| m_histogram.init(1 << (bits_per_sym * total_syms_per_group)); |
| } |
| |
| inline void clear() |
| { |
| m_group_bits.clear(); |
| |
| m_cur_sym_bits = 0; |
| m_cur_num_syms = 0; |
| m_decode_syms_remaining = 0; |
| m_next_decoder_group_index = 0; |
| } |
| |
| inline void emit(uint32_t sym) |
| { |
| m_cur_sym_bits |= (sym << (m_cur_num_syms * m_bits_per_sym)); |
| m_cur_num_syms++; |
| |
| if (m_cur_num_syms == m_total_syms_per_group) |
| flush(); |
| } |
| |
| inline void flush() |
| { |
| if (m_cur_num_syms) |
| { |
| m_group_bits.push_back(m_cur_sym_bits); |
| m_histogram.inc(m_cur_sym_bits); |
| |
| m_cur_sym_bits = 0; |
| m_cur_num_syms = 0; |
| } |
| } |
| |
| inline bool start_encoding(uint32_t code_size_limit = 16) |
| { |
| flush(); |
| |
| if (!m_encoding_table.init(m_histogram, code_size_limit)) |
| return false; |
| |
| m_decode_syms_remaining = 0; |
| m_next_decoder_group_index = 0; |
| |
| return true; |
| } |
| |
| inline uint32_t emit_next_sym(bitwise_coder &c) |
| { |
| uint32_t bits = 0; |
| |
| if (!m_decode_syms_remaining) |
| { |
| bits = c.put_code(m_group_bits[m_next_decoder_group_index++], m_encoding_table); |
| m_decode_syms_remaining = m_total_syms_per_group; |
| } |
| |
| m_decode_syms_remaining--; |
| return bits; |
| } |
| |
| inline void emit_flush() |
| { |
| m_decode_syms_remaining = 0; |
| } |
| |
| private: |
| uint_vec m_group_bits; |
| huffman_encoding_table m_encoding_table; |
| histogram m_histogram; |
| uint32_t m_bits_per_sym, m_total_syms_per_group, m_cur_sym_bits, m_cur_num_syms, m_next_decoder_group_index, m_decode_syms_remaining; |
| }; |
| |
| bool huffman_test(int rand_seed); |
| |
| // VQ index reordering |
| |
| class palette_index_reorderer |
| { |
| public: |
| palette_index_reorderer() |
| { |
| } |
| |
| void clear() |
| { |
| clear_vector(m_hist); |
| clear_vector(m_total_count_to_picked); |
| clear_vector(m_entries_picked); |
| clear_vector(m_entries_to_do); |
| clear_vector(m_remap_table); |
| } |
| |
| // returns [0,1] distance of entry i to entry j |
| typedef float(*pEntry_dist_func)(uint32_t i, uint32_t j, void *pCtx); |
| |
| void init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight); |
| |
| // Table remaps old to new symbol indices |
| inline const uint_vec &get_remap_table() const { return m_remap_table; } |
| |
| private: |
| uint_vec m_hist, m_total_count_to_picked, m_entries_picked, m_entries_to_do, m_remap_table; |
| |
| inline uint32_t get_hist(int i, int j, int n) const { return (i > j) ? m_hist[j * n + i] : m_hist[i * n + j]; } |
| inline void inc_hist(int i, int j, int n) { if ((i != j) && (i < j) && (i != -1) && (j != -1)) { assert(((uint32_t)i < (uint32_t)n) && ((uint32_t)j < (uint32_t)n)); m_hist[i * n + j]++; } } |
| |
| void prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices); |
| void find_initial(uint32_t num_syms); |
| void find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight); |
| float pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight); |
| }; |
| |
| // Simple 32-bit 2D image class |
| |
| class image |
| { |
| public: |
| image() : |
| m_width(0), m_height(0), m_pitch(0) |
| { |
| } |
| |
| image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : |
| m_width(0), m_height(0), m_pitch(0) |
| { |
| resize(w, h, p); |
| } |
| |
| image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) : |
| m_width(0), m_height(0), m_pitch(0) |
| { |
| init(pImage, width, height, comps); |
| } |
| |
| image(const image &other) : |
| m_width(0), m_height(0), m_pitch(0) |
| { |
| *this = other; |
| } |
| |
| image &swap(image &other) |
| { |
| std::swap(m_width, other.m_width); |
| std::swap(m_height, other.m_height); |
| std::swap(m_pitch, other.m_pitch); |
| m_pixels.swap(other.m_pixels); |
| return *this; |
| } |
| |
| image &operator= (const image &rhs) |
| { |
| if (this != &rhs) |
| { |
| m_width = rhs.m_width; |
| m_height = rhs.m_height; |
| m_pitch = rhs.m_pitch; |
| m_pixels = rhs.m_pixels; |
| } |
| return *this; |
| } |
| |
| image &clear() |
| { |
| m_width = 0; |
| m_height = 0; |
| m_pitch = 0; |
| clear_vector(m_pixels); |
| return *this; |
| } |
| |
| image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color) |
| { |
| return crop(w, h, p, background); |
| } |
| |
| image &set_all(const color_rgba &c) |
| { |
| for (uint32_t i = 0; i < m_pixels.size(); i++) |
| m_pixels[i] = c; |
| return *this; |
| } |
| |
| void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) |
| { |
| assert(comps >= 1 && comps <= 4); |
| |
| resize(width, height); |
| |
| for (uint32_t y = 0; y < height; y++) |
| { |
| for (uint32_t x = 0; x < width; x++) |
| { |
| const uint8_t *pSrc = &pImage[(x + y * width) * comps]; |
| color_rgba &dst = (*this)(x, y); |
| |
| if (comps == 1) |
| { |
| dst.r = pSrc[0]; |
| dst.g = pSrc[0]; |
| dst.b = pSrc[0]; |
| dst.a = 255; |
| } |
| else if (comps == 2) |
| { |
| dst.r = pSrc[0]; |
| dst.g = pSrc[0]; |
| dst.b = pSrc[0]; |
| dst.a = pSrc[1]; |
| } |
| else |
| { |
| dst.r = pSrc[0]; |
| dst.g = pSrc[1]; |
| dst.b = pSrc[2]; |
| if (comps == 4) |
| dst.a = pSrc[3]; |
| else |
| dst.a = 255; |
| } |
| } |
| } |