| // File: basisu_astc_hdr_6x6_enc.cpp |
| // Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved. |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| #include "basisu_astc_hdr_6x6_enc.h" |
| #include "basisu_enc.h" |
| #include "basisu_astc_hdr_common.h" |
| #include "basisu_math.h" |
| #include "basisu_resampler.h" |
| #include "basisu_resampler_filters.h" |
| |
| #define MINIZ_HEADER_FILE_ONLY |
| #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES |
| #include "basisu_miniz.h" |
| |
| #include "3rdparty/android_astc_decomp.h" |
| |
| #include <array> |
| #include <cfloat> |
| |
| using namespace basisu; |
| using namespace buminiz; |
| using namespace basist::astc_6x6_hdr; |
| |
| namespace astc_6x6_hdr |
| { |
| |
// Atomically raises atomic_var to new_value if new_value is larger (relaxed ordering,
// no synchronization implied - only the max itself is guaranteed).
static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)
{
	uint32_t observed = atomic_var.load(std::memory_order_relaxed);

	// On failure compare_exchange_weak refreshes 'observed' with the current value,
	// so we simply retry with the updated maximum until the CAS sticks.
	while (!atomic_var.compare_exchange_weak(observed, std::max(observed, new_value), std::memory_order_relaxed, std::memory_order_relaxed))
	{
	}
}
| |
| void astc_hdr_6x6_global_config::set_user_level(int level) |
| { |
| level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL); |
| |
| m_master_comp_level = 0; |
| m_highest_comp_level = 0; |
| m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS; |
| m_extra_patterns_flag = false; |
| m_brute_force_partition_matching = false; |
| |
| switch (level) |
| { |
| case 0: |
| { |
| // Both reduce compression a lot when lambda>0 |
| m_favor_higher_compression = false; |
| m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2; |
| break; |
| } |
| case 1: |
| { |
| m_master_comp_level = 0; |
| m_highest_comp_level = 0; |
| break; |
| } |
| case 2: |
| { |
| m_master_comp_level = 0; |
| m_highest_comp_level = 1; |
| break; |
| } |
| case 3: |
| { |
| m_master_comp_level = 1; |
| m_highest_comp_level = 1; |
| break; |
| } |
| case 4: |
| { |
| m_master_comp_level = 1; |
| m_highest_comp_level = 2; |
| break; |
| } |
| case 5: |
| { |
| m_master_comp_level = 1; |
| m_highest_comp_level = 3; |
| break; |
| } |
| case 6: |
| { |
| m_master_comp_level = 1; |
| m_highest_comp_level = 4; |
| break; |
| } |
| case 7: |
| { |
| m_master_comp_level = 2; |
| m_highest_comp_level = 2; |
| break; |
| } |
| case 8: |
| { |
| m_master_comp_level = 2; |
| m_highest_comp_level = 3; |
| break; |
| } |
| case 9: |
| { |
| m_master_comp_level = 2; |
| m_highest_comp_level = 4; |
| break; |
| } |
| case 10: |
| { |
| m_master_comp_level = 3; |
| m_highest_comp_level = 3; |
| break; |
| } |
| case 11: |
| { |
| m_master_comp_level = 3; |
| m_highest_comp_level = 4; |
| break; |
| } |
| case 12: |
| default: |
| { |
| m_master_comp_level = 4; |
| m_highest_comp_level = 4; |
| m_extra_patterns_flag = true; |
| m_brute_force_partition_matching = true; |
| break; |
| } |
| } |
| } |
| |
// SMPTE ST 2084 / BT.2100 PQ curve constants.
const float m1 = 0.1593017578125f; // (2610 / 2^14) * (1/100)
const float m2 = 78.84375f; // (2523 / 32) * (1/100)
const float c1 = 0.8359375f; // 3424 / (2^12)
const float c2 = 18.8515625f; // (2413 / 128)
const float c3 = 18.6875f; // (2392 / 128)

// Forward PQ (inverse EOTF): maps absolute luminance Y in nits to a nonlinear code value
// in roughly [0, 1] (Y = 10,000 nits maps to 1.0).
static float forwardPQ(float Y)
{
	// 10,000 here is an absolute scale - it's in nits (cd per square meter)
	const float L = Y * (1.0f / 10000.0f);

	const float Lp = powf(L, m1);
	return powf((c1 + c2 * Lp) / (1 + c3 * Lp), m2);
}
| |
#if 0
// Inverse PQ (EOTF): maps a nonlinear PQ code value E back to absolute luminance in nits.
// Exact inverse of forwardPQ(); currently unused, kept for reference/debugging.
static float inversePQ(float E)
{
	float N = powf(E, 1.0f / m2);

	// Clamp the numerator at 0 so tiny code values don't go negative before the root.
	float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N);
	float L = powf(num, 1.0f / m1);

	return L * 10000.0f;
}
#endif
| |
| // PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries. |
| // max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86 |
| // Highest error is for values less than SMALLEST_PQ_VAL_IN. |
| // |
| // Approximation is round trip lossless for 10-12 bits at [0,10000] nits: |
| // for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096): |
| // round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x |
| // |
| // bfloat16 has enough precision to handle 8-bit sRGB to linear conversions: |
| // round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless |
| |
// bfloat16 exponent range covered by the PQ approximation tables.
const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16;
const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);

// Smallest input magnitude covered by the tables, and its PQ value; inputs below
// SMALLEST_PQ_VAL_IN are handled by a simple lerp in forwardPQTab().
const float SMALLEST_PQ_VAL_IN = 0.000015258829080f;
const float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN)

// PQ output returned for inputs above the covered exponent range.
const float LARGEST_PQ_VAL = 1.251312f;

// [exponent - PQ_APPROX_MIN_EXP][mantissa] -> forwardPQ(bfloat16 value).
// Filled in by init_pq_tables(); read by forwardPQTab().
float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];
| |
| static void init_pq_tables() |
| { |
| for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++) |
| { |
| for (int mant = 0; mant < 128; mant++) |
| { |
| bfloat16 b = bfloat16_init(1, exp, mant); |
| float bf = bfloat16_to_float(b); |
| |
| float pq = forwardPQ(bf); |
| |
| g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq; |
| } |
| } |
| |
| //fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0])); |
| //fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN)); |
| } |
| |
// Fast table-driven approximation of forwardPQ(): brackets v between two adjacent
// bfloat16 values, looks up their precomputed PQ values, and linearly interpolates.
// init_pq_tables() must have been called first. See accuracy notes above.
static inline float forwardPQTab(float v)
{
	// Catches use before init_pq_tables() (entry [0][0] is nonzero once initialized).
	assert(g_pq_approx_tabs[0][0]);

	assert(v >= 0.0f);
	if (v == 0.0f)
		return 0.0f;

	// Convert rounding down, so bf <= v (second arg false presumably selects
	// truncation rather than round-to-nearest - see float_to_bfloat16).
	bfloat16 bf = float_to_bfloat16(v, false);
	assert(v >= bfloat16_to_float(bf));

	int exp = bfloat16_get_exp(bf);

	if (exp < PQ_APPROX_MIN_EXP)
	{
		// not accurate but should be good enough for our uses
		return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));
	}
	else if (exp > PQ_APPROX_MAX_EXP)
		return LARGEST_PQ_VAL;

	int mant = bfloat16_get_mantissa(bf);

	// PQ value at the lower bracket.
	float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];
	float bf_f32 = bfloat16_to_float(bf);

	// Step to the next representable bfloat16, carrying into the exponent if needed.
	int next_mant = mant + 1;
	int next_exp = exp;
	if (next_mant == 128)
	{
		next_mant = 0;
		next_exp++;
		if (next_exp > PQ_APPROX_MAX_EXP)
			return a;
	}

	float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];

	bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);
	float next_bf_f32 = bfloat16_to_float(next_bf);
	assert(v <= next_bf_f32);

	// Interpolate between the two bracketing table entries.
	float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);
	assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));

	return lerp(a, b, lerp_factor);
}
| |
// 100 nits maps to an intensity (I) of ~.5
| // This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2. |
| // To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true). |
| // Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true. |
| // |
| // ITP info: |
| // https://www.portrait.com/resource-center/ictcp-color-difference-metric/ |
| // https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's) |
| // This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP. |
| // |
| // Linear REC709 to REC2020/BT.2100 gamut conversion: |
| // rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f; |
| // rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f; |
| // rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f; |
| // const float S = 1.0f / 4096.0f; |
| // l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2]; |
| // m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2]; |
| // s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2]; |
| static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false) |
| { |
| vec3F rgb_2100(rgb_in); |
| |
| float l, m, s; |
| if (!rec2020_bt2100_color_gamut) |
| { |
| // Assume REC 709 input color gamut |
| // (REC2020_to_LMS * REC709_to_2020) * input_color |
| l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f; |
| m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f; |
| s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f; |
| } |
| else |
| { |
| // Assumes REC2020/BT.2100 input color gamut (this is from the spec) |
| l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * rgb_2100[2]; |
| m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2]; |
| s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2]; |
| } |
| |
| float ld = forwardPQTab(l); |
| float md = forwardPQTab(m); |
| float sd = forwardPQTab(s); |
| |
| ictcp[0] = .5f * ld + .5f * md; |
| |
| // if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear) |
| if (itp_flag) |
| ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd; |
| else |
| ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd; |
| |
| ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd; |
| } |
| |
// Convenience wrapper: linear RGB -> perceptually uniform ITP (Ct pre-scaled by .5),
// taking the input color gamut from the global config.
static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)
{
	linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);
}
| |
#if 0
// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).
// Inverse of linear_rgb_to_ictcp(); currently unused, kept for reference/debugging.
static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)
{
	float ct = ictcp[1];

	// ITP carries Ct at half scale; undo that first.
	if (itp_flag)
		ct *= 2.0f;

	// ICtCp -> PQ-encoded LMS
	float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f;
	float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;
	float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;

	// PQ decode to linear LMS
	float l = inversePQ(ld);
	float m = inversePQ(md);
	float s = inversePQ(sd);

	// LMS -> linear REC2020/BT.2100 RGB
	rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;
	rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;
	rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;
}
#endif
| |
| struct half_vec3 |
| { |
| basist::half_float m_vals[3]; |
| |
| inline half_vec3() { } |
| |
| inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z) |
| { |
| m_vals[0] = x; |
| m_vals[1] = y; |
| m_vals[2] = z; |
| } |
| |
| inline half_vec3(const half_vec3& other) |
| { |
| *this = other; |
| } |
| |
| inline half_vec3& operator= (const half_vec3& rhs) |
| { |
| m_vals[0] = rhs.m_vals[0]; |
| m_vals[1] = rhs.m_vals[1]; |
| m_vals[2] = rhs.m_vals[2]; |
| return *this; |
| } |
| |
| inline void clear() |
| { |
| clear_obj(m_vals); |
| } |
| |
| inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z) |
| { |
| m_vals[0] = x; |
| m_vals[1] = y; |
| m_vals[2] = z; |
| return *this; |
| } |
| |
| inline half_vec3& set(float x, float y, float z) |
| { |
| m_vals[0] = basist::float_to_half(x); |
| m_vals[1] = basist::float_to_half(y); |
| m_vals[2] = basist::float_to_half(z); |
| return *this; |
| } |
| |
| template<typename T> |
| inline half_vec3& set_vec(const T& vec) |
| { |
| m_vals[0] = basist::float_to_half(vec[0]); |
| m_vals[1] = basist::float_to_half(vec[1]); |
| m_vals[2] = basist::float_to_half(vec[2]); |
| return *this; |
| } |
| |
| template<typename T> |
| inline T get_vec() const |
| { |
| return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2])); |
| } |
| |
| inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; } |
| inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; } |
| |
| float get_float_comp(uint32_t c) const |
| { |
| assert(c < 3); |
| return basist::half_to_float(m_vals[c]); |
| } |
| |
| half_vec3& set_float_comp(uint32_t c, float v) |
| { |
| assert(c < 3); |
| m_vals[c] = basist::float_to_half(v); |
| return *this; |
| } |
| }; |
| |
| struct half_vec4 |
| { |
| basist::half_float m_vals[4]; |
| |
| inline half_vec4() { } |
| |
| inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w) |
| { |
| m_vals[0] = x; |
| m_vals[1] = y; |
| m_vals[2] = z; |
| m_vals[3] = w; |
| } |
| |
| inline half_vec4(const half_vec4& other) |
| { |
| *this = other; |
| } |
| |
| inline half_vec4& operator= (const half_vec4& rhs) |
| { |
| m_vals[0] = rhs.m_vals[0]; |
| m_vals[1] = rhs.m_vals[1]; |
| m_vals[2] = rhs.m_vals[2]; |
| m_vals[3] = rhs.m_vals[3]; |
| return *this; |
| } |
| |
| inline void clear() |
| { |
| clear_obj(m_vals); |
| } |
| |
| inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w) |
| { |
| m_vals[0] = x; |
| m_vals[1] = y; |
| m_vals[2] = z; |
| m_vals[3] = w; |
| return *this; |
| } |
| |
| inline half_vec4& set(float x, float y, float z, float w) |
| { |
| m_vals[0] = basist::float_to_half(x); |
| m_vals[1] = basist::float_to_half(y); |
| m_vals[2] = basist::float_to_half(z); |
| m_vals[3] = basist::float_to_half(w); |
| return *this; |
| } |
| |
| template<typename T> |
| inline half_vec4& set_vec(const T& vec) |
| { |
| m_vals[0] = basist::float_to_half(vec[0]); |
| m_vals[1] = basist::float_to_half(vec[1]); |
| m_vals[2] = basist::float_to_half(vec[2]); |
| m_vals[3] = basist::float_to_half(vec[3]); |
| return *this; |
| } |
| |
| template<typename T> |
| inline T get_vec() const |
| { |
| return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3])); |
| } |
| |
| inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; } |
| inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; } |
| |
| float get_float_comp(uint32_t c) const |
| { |
| assert(c < 4); |
| return basist::half_to_float(m_vals[c]); |
| } |
| |
| half_vec4& set_float_comp(uint32_t c, float v) |
| { |
| assert(c < 4); |
| m_vals[c] = basist::float_to_half(v); |
| return *this; |
| } |
| }; |
| |
// Block dimensions handled by this encoder (it only emits 6x6 blocks).
const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6;

// Result of one encoding trial: the candidate logical ASTC block, its error, and
// whether the trial produced a usable encoding.
struct trial_result
{
	astc_helpers::log_astc_block m_log_blk;
	double m_err;
	bool m_valid;
};
| |
| //---------------------------------------------------------- |
| |
| const uint32_t NUM_PART3_MAPPINGS = 6; |
| static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] = |
| { |
| { 0, 1, 2 }, |
| { 1, 2, 0 }, |
| { 2, 0, 1 }, |
| { 0, 2, 1 }, |
| { 1, 0, 2 }, |
| { 2, 1, 0 } |
| }; |
| |
| struct partition_pattern_vec |
| { |
| uint8_t m_parts[6 * 6]; |
| |
| partition_pattern_vec() |
| { |
| clear(); |
| } |
| |
| partition_pattern_vec(const partition_pattern_vec& other) |
| { |
| *this = other; |
| } |
| |
| void clear() |
| { |
| memset(m_parts, 0, sizeof(m_parts)); |
| } |
| |
| partition_pattern_vec& operator= (const partition_pattern_vec& rhs) |
| { |
| if (this == &rhs) |
| return *this; |
| memcpy(m_parts, rhs.m_parts, 36); |
| return *this; |
| } |
| |
| uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; } |
| uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; } |
| |
| uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; } |
| uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; } |
| |
| int get_squared_distance(const partition_pattern_vec& other) const |
| { |
| int total_dist = 0; |
| for (uint32_t i = 0; i < 36; i++) |
| total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]); |
| return total_dist; |
| } |
| |
| float get_distance(const partition_pattern_vec& other) const |
| { |
| return sqrtf((float)get_squared_distance(other)); |
| } |
| |
| partition_pattern_vec get_permuted2(uint32_t permute_index) const |
| { |
| assert(permute_index <= 1); |
| |
| partition_pattern_vec res; |
| for (uint32_t i = 0; i < 36; i++) |
| { |
| assert(m_parts[i] <= 1); |
| res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index); |
| } |
| |
| return res; |
| } |
| |
| partition_pattern_vec get_permuted3(uint32_t permute_index) const |
| { |
| assert(permute_index <= 5); |
| |
| partition_pattern_vec res; |
| for (uint32_t i = 0; i < 36; i++) |
| { |
| assert(m_parts[i] <= 2); |
| res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]]; |
| } |
| |
| return res; |
| } |
| |
| partition_pattern_vec get_canonicalized() const |
| { |
| partition_pattern_vec res; |
| |
| int new_labels[3] = { -1, -1, -1 }; |
| uint32_t next_index = 0; |
| for (uint32_t i = 0; i < 36; i++) |
| { |
| uint32_t p = m_parts[i]; |
| if (new_labels[p] == -1) |
| new_labels[p] = next_index++; |
| |
| res.m_parts[i] = (uint8_t)new_labels[p]; |
| } |
| |
| return res; |
| } |
| |
| bool operator== (const partition_pattern_vec& rhs) const |
| { |
| return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0; |
| } |
| |
| operator size_t() const |
| { |
| return basist::hash_hsieh(m_parts, sizeof(m_parts)); |
| } |
| }; |
| |
// One node of the vantage-point (VP) tree over partition patterns.
struct vp_tree_node
{
	partition_pattern_vec m_vantage_point; // this node's vantage point pattern
	uint32_t m_point_index;                // index of the vantage point in the source pattern array
	float m_dist;                          // split distance: inner subtree holds points within m_dist

	int m_inner_node, m_outer_node;        // child node indices, -1 if absent
};

// NOTE(review): presumably selects a brute-force linear scan over the VP-tree search
// when nonzero - confirm against the usage sites later in this file.
#define BRUTE_FORCE_PART_SEARCH (0)
| |
// Vantage-point (VP) tree over unique partition patterns. Supports k-nearest-neighbor
// queries under the metric defined by partition_pattern_vec::get_distance(), matching
// patterns up to subset label permutation.
class vp_tree
{
public:
	vp_tree()
	{
	}

	void clear()
	{
		m_nodes.clear();
	}

	// This requires no redundant patterns, i.e. all must be unique.
	// Builds the tree over pUnique_pats[0..n-1]. Returns false on failure.
	bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)
	{
		clear();

		uint_vec pat_indices(n);
		for (uint32_t i = 0; i < n; i++)
			pat_indices[i] = i;

		// Choose the root vantage point and its split distance.
		std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);

		if (root_idx.first == -1)
			return false;

		m_nodes.resize(1);
		m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];
		m_nodes[0].m_point_index = root_idx.first;
		m_nodes[0].m_dist = root_idx.second;
		m_nodes[0].m_inner_node = -1;
		m_nodes[0].m_outer_node = -1;

		uint_vec inner_list, outer_list;

		inner_list.reserve(n / 2);
		outer_list.reserve(n / 2);

		// Split the remaining patterns: inner = within the split distance, outer = beyond it.
		for (uint32_t pat_index = 0; pat_index < n; pat_index++)
		{
			if ((int)pat_index == root_idx.first)
				continue;

			const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);

			if (dist <= root_idx.second)
				inner_list.push_back(pat_index);
			else
				outer_list.push_back(pat_index);
		}

		if (inner_list.size())
		{
			m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);
			if (m_nodes[0].m_inner_node < 0)
				return false;
		}

		if (outer_list.size())
		{
			m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);
			if (m_nodes[0].m_outer_node < 0)
				return false;
		}

		return true;
	}

	// One search hit: pattern index, best label-permutation index, and its distance.
	struct result
	{
		uint32_t m_pat_index;
		uint32_t m_mapping_index;
		float m_dist;

		bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }
		bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }
	};

	// Fixed-capacity max-heap of results using 1-based element indexing. top() is the
	// WORST (largest-distance) kept result, so it can be discarded when a better
	// candidate arrives and the queue is full.
	class result_queue
	{
		enum { MaxSupportedSize = 256 + 1 };

	public:
		result_queue() :
			m_cur_size(0)
		{
		}

		size_t get_size() const
		{
			return m_cur_size;
		}

		bool empty() const
		{
			return !m_cur_size;
		}

		typedef std::array<result, MaxSupportedSize + 1> result_array_type;

		const result_array_type& get_elements() const { return m_elements; }
		result_array_type& get_elements() { return m_elements; }

		void clear()
		{
			m_cur_size = 0;
		}

		// No-op: storage is a fixed array; kept for interface compatibility.
		void reserve(uint32_t n)
		{
			BASISU_NOTE_UNUSED(n);
		}

		// Returns the worst (largest-distance) kept result. Heap root lives at index 1.
		const result& top() const
		{
			assert(m_cur_size);
			return m_elements[1];
		}

		// Inserts val, then evicts the worst element if the queue exceeds max_size.
		bool insert(const result& val, uint32_t max_size)
		{
			assert(max_size < MaxSupportedSize);

			if (m_cur_size >= MaxSupportedSize)
				return false;

			m_elements[++m_cur_size] = val;
			up_heap(m_cur_size);

			if (m_cur_size > max_size)
				pop();

			return true;
		}

		// Removes the worst (root) element.
		bool pop()
		{
			if (m_cur_size == 0)
				return false;

			m_elements[1] = m_elements[m_cur_size--];
			down_heap(1);
			return true;
		}

		// Distance of the worst kept result (0 if empty) - used as the pruning radius.
		float get_highest_dist() const
		{
			if (!m_cur_size)
				return 0.0f;

			return top().m_dist;
		}

	private:
		result_array_type m_elements;
		size_t m_cur_size;

		// Standard binary max-heap sift-up from index (1-based).
		void up_heap(size_t index)
		{
			while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))
			{
				std::swap(m_elements[index], m_elements[index >> 1]);
				index >>= 1;
			}
		}

		// Standard binary max-heap sift-down from index (1-based).
		void down_heap(size_t index)
		{
			for ( ; ; )
			{
				size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;

				if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))
					largest = left_child;

				if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))
					largest = right_child;

				if (largest == index)
					break;

				std::swap(m_elements[index], m_elements[largest]);
				index = largest;
			}
		}
	};

	// Finds up to max_results patterns nearest to desired_pat, considering every label
	// permutation (2 for 2-subset patterns, 6 for 3-subset patterns).
	void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)
	{
		assert((num_subsets >= 2) && (num_subsets <= 3));

		results.clear();

		if (!m_nodes.size())
			return;

		// Expand the query into all equivalent-by-relabeling patterns.
		uint32_t num_desired_pats;
		partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];

		if (num_subsets == 2)
		{
			num_desired_pats = 2;
			for (uint32_t i = 0; i < 2; i++)
				desired_pats[i] = desired_pat.get_permuted2(i);
		}
		else
		{
			num_desired_pats = NUM_PART3_MAPPINGS;
			for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)
				desired_pats[i] = desired_pat.get_permuted3(i);
		}

#if 0
		find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);
#else
		find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);
#endif
	}

private:
	basisu::vector<vp_tree_node> m_nodes;

	// Recursive k-NN search (reference implementation; the non-recursive version below is used).
	void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
	{
		// Distance from the query (best over all label permutations) to this node's vantage point.
		float best_dist_to_vantage = BIG_FLOAT_VAL;
		uint32_t best_mapping = 0;
		for (uint32_t i = 0; i < num_desired_pats; i++)
		{
			float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
			if (dist < best_dist_to_vantage)
			{
				best_dist_to_vantage = dist;
				best_mapping = i;
			}
		}

		result r;
		r.m_dist = best_dist_to_vantage;
		r.m_mapping_index = best_mapping;
		r.m_pat_index = m_nodes[node_index].m_point_index;

		results.insert(r, max_results);

		// Descend into the more promising child first; only visit the other child if it
		// could still contain a result within the current pruning radius.
		if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
		{
			// inner first
			if (m_nodes[node_index].m_inner_node >= 0)
				find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);

			if (m_nodes[node_index].m_outer_node >= 0)
			{
				if ( (results.get_size() < max_results) ||
					((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
					)
				{
					find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
				}
			}
		}
		else
		{
			// outer first
			if (m_nodes[node_index].m_outer_node >= 0)
				find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);

			if (m_nodes[node_index].m_inner_node >= 0)
			{
				if ( (results.get_size() < max_results) ||
					((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
					)
				{
					find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
				}
			}
		}
	}

	// Iterative k-NN search using an explicit node stack (same pruning as the recursive
	// version; children are pushed so the preferred child is processed first).
	void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
	{
		uint_vec node_stack;
		node_stack.reserve(16);
		node_stack.push_back(init_node_index);

		do
		{
			const uint32_t node_index = node_stack.back();
			node_stack.pop_back();

			// Distance from the query (best over all label permutations) to this vantage point.
			float best_dist_to_vantage = BIG_FLOAT_VAL;
			uint32_t best_mapping = 0;
			for (uint32_t i = 0; i < num_desired_pats; i++)
			{
				float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
				if (dist < best_dist_to_vantage)
				{
					best_dist_to_vantage = dist;
					best_mapping = i;
				}
			}

			result r;
			r.m_dist = best_dist_to_vantage;
			r.m_mapping_index = best_mapping;
			r.m_pat_index = m_nodes[node_index].m_point_index;

			results.insert(r, max_results);

			// Push the less promising child first (it's popped last), pruned by the
			// current worst kept distance; then push the preferred child.
			if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
			{
				if (m_nodes[node_index].m_outer_node >= 0)
				{
					if ((results.get_size() < max_results) ||
						((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
						)
					{
						node_stack.push_back(m_nodes[node_index].m_outer_node);
					}
				}

				// inner first
				if (m_nodes[node_index].m_inner_node >= 0)
				{
					node_stack.push_back(m_nodes[node_index].m_inner_node);
				}
			}
			else
			{
				if (m_nodes[node_index].m_inner_node >= 0)
				{
					if ((results.get_size() < max_results) ||
						((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
						)
					{
						node_stack.push_back(m_nodes[node_index].m_inner_node);
					}
				}

				// outer first
				if (m_nodes[node_index].m_outer_node >= 0)
				{
					node_stack.push_back(m_nodes[node_index].m_outer_node);
				}
			}

		} while (!node_stack.empty());
	}

	// Recursively builds a subtree over the patterns in pat_indices.
	// returns the index of the new node, or -1 on error
	int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)
	{
		std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);

		if (root_idx.first < 0)
			return -1;

		m_nodes.resize(m_nodes.size() + 1);
		const uint32_t new_node_index = m_nodes.size_u32() - 1;

		m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];
		m_nodes[new_node_index].m_point_index = root_idx.first;
		m_nodes[new_node_index].m_dist = root_idx.second;
		m_nodes[new_node_index].m_inner_node = -1;
		m_nodes[new_node_index].m_outer_node = -1;

		uint_vec inner_list, outer_list;

		inner_list.reserve(pat_indices.size_u32() / 2);
		outer_list.reserve(pat_indices.size_u32() / 2);

		// Split the remaining patterns about the vantage point's split distance.
		for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)
		{
			const uint32_t pat_index = pat_indices[pat_indices_iter];

			if ((int)pat_index == root_idx.first)
				continue;

			const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);

			if (dist <= root_idx.second)
				inner_list.push_back(pat_index);
			else
				outer_list.push_back(pat_index);
		}

		// NOTE: the recursive calls may grow m_nodes; the assignments below are safe in
		// C++17 because the right operand is sequenced before the subscript evaluation.
		if (inner_list.size())
			m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);

		if (outer_list.size())
			m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);

		return new_node_index;
	}

	// Picks the vantage point (from pat_indices) whose median-distance split most evenly
	// balances inner vs. outer sets, breaking ties by preferring higher distance variance.
	// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
	std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)
	{
		BASISU_NOTE_UNUSED(num_unique_pats);

		const uint32_t n = pat_indices.size_u32();

		assert(n);
		if (n == 1)
			return std::pair(pat_indices[0], 0.0f);

		float best_split_metric = -1.0f;
		int best_split_pat = -1;
		float best_split_dist = 0.0f;
		float best_split_var = 0.0f;

		basisu::vector< std::pair<float, uint32_t> > dists;
		dists.reserve(n);

		float_vec float_dists;
		float_dists.reserve(n);

		// Try every pattern as the vantage point (O(n^2) distance evaluations).
		for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)
		{
			const uint32_t split_pat_index = pat_indices[pat_indices_iter];
			assert(split_pat_index < num_unique_pats);

			const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];

			dists.resize(0);
			float_dists.resize(0);

			// Gather distances from the trial vantage point to all other patterns.
			for (uint32_t j = 0; j < n; j++)
			{
				const uint32_t pat_index = pat_indices[j];
				assert(pat_index < num_unique_pats);

				if (pat_index == split_pat_index)
					continue;

				float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
				dists.emplace_back(std::pair(dist, pat_index));

				float_dists.push_back(dist);
			}

			// Distance variance is used as a tiebreaker below.
			stats<double> s;
			s.calc(float_dists.size_u32(), float_dists.data());

			std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {
				return a.first < b.first;
				});

			// Split at the median distance.
			const uint32_t num_dists = dists.size_u32();
			float split_dist = dists[num_dists / 2].first;
			if ((num_dists & 1) == 0)
				split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;

			// Count how evenly the median splits the set.
			uint32_t total_inner = 0, total_outer = 0;

			for (uint32_t j = 0; j < n; j++)
			{
				const uint32_t pat_index = pat_indices[j];
				if (pat_index == split_pat_index)
					continue;

				float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);

				if (dist <= split_dist)
					total_inner++;
				else
					total_outer++;
			}

			// 1.0 = perfectly balanced split, 0 = totally unbalanced.
			float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);

			if ( (split_metric > best_split_metric) ||
				((split_metric == best_split_metric) && (s.m_var > best_split_var)) )
			{
				best_split_metric = split_metric;
				best_split_dist = split_dist;
				best_split_pat = split_pat_index;
				best_split_var = (float)s.m_var;
			}
		}

		return std::pair(best_split_pat, best_split_dist);
	}
};
| |
| struct partition |
| { |
| uint64_t m_p; |
| |
| inline partition() : |
| m_p(0) |
| { |
| } |
| |
| inline partition(uint64_t p) : |
| m_p(p) |
| { |
| assert(p < (1ULL << 36)); |
| } |
| |
| inline partition& operator=(uint64_t p) |
| { |
| assert(p < (1ULL << 36)); |
| m_p = p; |
| return *this; |
| } |
| |
| inline bool operator< (const partition& p) const |
| { |
| return m_p < p.m_p; |
| } |
| |
| inline bool operator== (const partition& p) const |
| { |
| return m_p == p.m_p; |
| } |
| |
| inline operator size_t() const |
| { |
| return basist::hash_hsieh((const uint8_t *)&m_p, sizeof(m_p)); |
| } |
| }; |
| |
// All unique 2-subset 6x6 partition patterns, the ASTC 10-bit seed -> unique index
// mapping, and the VP-tree used to search them; initialized by init_partitions2_6x6().
partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];
int g_part2_seed_to_unique_index[1024];
vp_tree g_part2_vp_tree;
| |
// Approximately normalizes axis using a fast inverse square root.
// NOTE(review): this assumes vec3F::norm() returns the SQUARED length (so
// inv_sqrt(norm) == 1/length) - confirm against the basisu vector class.
// Near-zero-length inputs fall back to a unit diagonal (1/sqrt(3) per component).
static inline vec3F vec3F_norm_approx(vec3F axis)
{
	float l = axis.norm();
	axis = (fabs(l) >= SMALL_FLOAT_VAL) ? (axis * bu_math::inv_sqrt(l)) : vec3F(0.577350269f);
	return axis;
}
| |
// Builds the unique 2-subset partition tables for 6x6 blocks.
// The disabled (#if 0) path documents how the canonical table was originally
// derived from all 1024 ASTC seeds (skipping degenerate and inverse-duplicate
// patterns). The live path regenerates each pattern directly from the
// precomputed g_part2_unique_index_to_seed table, fills the seed->unique
// mapping, then initializes the VP-tree used for nearest-pattern queries.
static void init_partitions2_6x6()
{
#if 0
	// maps pattern bits to the 10-bit ASTC seed index
	typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;
	partition2_hash_map phash;
	phash.reserve(1024);

	for (uint32_t i = 0; i < 1024; i++)
	{
		uint64_t p_bits = 0;
		uint64_t p_bits_inv = 0;

		// Pack the 36 texel partition IDs (0 or 1) into a bitmask, plus its inverse.
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);
				assert(p < 2);

				p_bits |= (p << (x + y * 6));
				p_bits_inv |= ((1 - p) << (x + y * 6));
			}
		}

		// Skip degenerate patterns that place every texel in a single subset.
		if (!p_bits)
			continue;
		if (p_bits == ((1ULL << 36) - 1))
			continue;

		assert(p_bits < (1ULL << 36));
		assert(p_bits_inv < (1ULL << 36));

		// Only record a pattern if neither it nor its inverse was seen already
		// (subset labels are interchangeable for 2 partitions).
		if (phash.contains(p_bits))
		{
		}
		else if (phash.contains(p_bits_inv))
		{
		}
		else
		{
			auto res = phash.insert(p_bits, i);
			assert(res.second);
			BASISU_NOTE_UNUSED(res);
		}
	}

	uint32_t num_unique_partitions2 = 0;

	for (const auto& r : phash)
	{
		assert(r.second < 1024);

		const uint32_t unique_index = num_unique_partitions2;
		assert(unique_index < NUM_UNIQUE_PARTITIONS2);

		// Unpack the bitmask back into a per-texel pattern vector.
		partition_pattern_vec pat_vec;
		for (uint32_t i = 0; i < 36; i++)
			pat_vec[i] = (uint8_t)((r.first >> i) & 1);

		g_partitions2[unique_index] = pat_vec;

		// Must agree with the precomputed table used by the live path below.
		assert(g_part2_unique_index_to_seed[unique_index] == r.second);
		g_part2_seed_to_unique_index[r.second] = unique_index;

		num_unique_partitions2++;
	}
	assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);
#else
	for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)
	{
		const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];
		assert(seed_index < 1024);

		// Each seed should map to exactly one unique pattern.
		assert(g_part2_seed_to_unique_index[seed_index] == 0);
		g_part2_seed_to_unique_index[seed_index] = unique_index;

		partition_pattern_vec& pat_vec = g_partitions2[unique_index];

		// Regenerate the 6x6 pattern from the ASTC seed.
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);
				assert(p < 2);

				pat_vec[x + y * 6] = p;
			}
		}
	}
#endif

	g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);
}
| |
// Estimates the most promising 2-subset partition patterns for a 6x6 HDR
// block. Pixels are projected onto an approximate principal axis (incremental
// power-iteration PCA) and thresholded at zero to form a desired 0/1 pattern;
// the closest canonical patterns are then found either by brute force or via
// the precomputed VP-tree. Writes num_best_parts unique pattern indices to
// pBest_parts. Always returns true.
static bool estimate_partition2_6x6(
	const basist::half_float pBlock_pixels_half[][3],
	int* pBest_parts, uint32_t num_best_parts)
{
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;

	vec3F training_vecs[BLOCK_T], mean(0.0f);

	// Convert half-float pixels to float vectors and accumulate the mean.
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];

		v[0] = (float)pBlock_pixels_half[i][0];
		v[1] = (float)pBlock_pixels_half[i][1];
		v[2] = (float)pBlock_pixels_half[i][2];

		mean += v;
	}
	mean *= (1.0f / (float)BLOCK_T);

	vec3F max_vals(-BIG_FLOAT_VAL);

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];
		max_vals = vec3F::component_max(max_vals, v);
	}

	// Initialize principle axis approximation
	vec3F axis(max_vals - mean);

	// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		axis = vec3F_norm_approx(axis);

		vec3F color(training_vecs[i] - mean);

		float d = color.dot(axis);

		axis += color * d;
	}

	// Fall back to the unit diagonal if the block is effectively constant.
	if (axis.norm() < SMALL_FLOAT_VAL)
		axis.set(0.57735027f);
	else
		axis.normalize_in_place();

#if BRUTE_FORCE_PART_SEARCH
	// Desired subset per texel: which side of the mean (along the axis) it falls on.
	int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		float proj = (training_vecs[i] - mean).dot(axis);

		desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;
	}
#else
	partition_pattern_vec desired_part;

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		float proj = (training_vecs[i] - mean).dot(axis);

		desired_part.m_parts[i] = proj < 0.0f;
	}
#endif

	//interval_timer tm;
	//tm.start();

#if BRUTE_FORCE_PART_SEARCH
	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];

	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)
	{
		const partition_pattern_vec &pat_vec = g_partitions2[part_index];

		// Score the pattern both as-is and inverted, since subset labels are
		// interchangeable for 2 partitions.
		int total_sim_non_inv = 0;
		int total_sim_inv = 0;

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				int part = pat_vec[x + y * 6];

				if (part == desired_parts[y][x])
					total_sim_non_inv++;

				if ((part ^ 1) == desired_parts[y][x])
					total_sim_inv++;
			}
		}

		int total_sim = maximum(total_sim_non_inv, total_sim_inv);

		// Pack (similarity, index) so a plain sort orders by similarity.
		part_similarity[part_index] = (total_sim << 16) | part_index;

	} // part_index;

	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);

	// Best (highest similarity) patterns are at the end of the sorted array.
	for (uint32_t i = 0; i < num_best_parts; i++)
		pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;
#else
	vp_tree::result_queue results;
	results.reserve(num_best_parts);
	g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);

	assert(results.get_size() == num_best_parts);

	const auto& elements = results.get_elements();

	// NOTE(review): element [0] is skipped — appears the queue's storage is
	// 1-based; confirm against vp_tree::result_queue's layout.
	for (uint32_t i = 0; i < results.get_size(); i++)
		pBest_parts[i] = elements[1 + i].m_pat_index;
#endif

	//fmt_printf("{} ", tm.get_elapsed_ms());

	return true;
}
| |
// Minimum compression level at which endpoint refinement is attempted (0 = always attempt it).
const uint32_t MIN_REFINE_LEVEL = 0;
| |
// Encodes a 6x6 block using 2 partitions with the given (already chosen)
// unique partition pattern. Each subset is fit independently (CEM 7 or 11),
// the per-subset weights are merged back into block order, and — when the
// weight grid is smaller than 6x6 — the weights are downsampled and the
// endpoints optionally refined against the downsampled grid.
//
// res[0] receives the primary trial; res[1] is only marked valid when
// endpoint refinement was requested and actually changed something.
// Returns false if either subset fails to encode (or the downsample matrix
// is missing).
static bool encode_block_2_subsets(
	trial_result res[2],
	uint32_t grid_w, uint32_t grid_h,
	uint32_t cem,
	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
	astc_hdr_codec_base_options& coptions,
	bool uber_mode_flag,
	int unique_pat_index,
	uint32_t comp_level,
	opt_mode_t mode11_opt_mode,
	bool refine_endpoints_flag)
{
	const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;

	res[0].m_valid = false;
	res[1].m_valid = false;

	const uint32_t BLOCK_W = 6, BLOCK_H = 6;

	astc_helpers::log_astc_block best_log_blk;
	clear_obj(best_log_blk);

	best_log_blk.m_num_partitions = 2;
	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
	best_log_blk.m_grid_width = (uint8_t)grid_w;
	best_log_blk.m_grid_height = (uint8_t)grid_h;

	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;

	partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];
	const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];

	// Scatter the block's pixels into per-subset arrays, remembering each
	// pixel's original block offset for later refinement.
	vec4F part_pixels_q16[2][64];
	half_vec3 part_half_pixels[2][64];
	uint8_t part_pixel_index[2][64];
	uint32_t part_total_pixels[2] = { 0 };

	for (uint32_t y = 0; y < BLOCK_H; y++)
	{
		for (uint32_t x = 0; x < BLOCK_W; x++)
		{
			uint32_t part_index = (*pPat)[x + y * BLOCK_W];

			uint32_t l = part_total_pixels[part_index];

			part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
			part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
			part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);

			part_total_pixels[part_index] = l + 1;
		} // x
	} // y

	uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];
	uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
	uint32_t best_submode[2];

	// Fit endpoints/weights for each subset independently.
	for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
	{
		assert(part_total_pixels[part_iter]);

		double e;
		if (cem == 7)
		{
			e = encode_astc_hdr_block_mode_7(
				part_total_pixels[part_iter],
				(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
				best_log_blk.m_weight_ise_range,
				best_submode[part_iter],
				BIG_FLOAT_VAL,
				blk_endpoints[part_iter],
				blk_weights[part_iter],
				coptions,
				best_log_blk.m_endpoint_ise_range);
		}
		else
		{
			assert(cem == 11);

			e = encode_astc_hdr_block_mode_11(
				part_total_pixels[part_iter],
				(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
				best_log_blk.m_weight_ise_range,
				best_submode[part_iter],
				BIG_FLOAT_VAL,
				blk_endpoints[part_iter],
				blk_weights[part_iter],
				coptions,
				false,
				best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
				mode11_opt_mode);
		}

		// BIG_FLOAT_VAL signals the subset could not be encoded at all.
		if (e == BIG_FLOAT_VAL)
			return false;

	} // part_iter

	// Gather the per-subset weights back into block (raster) order.
	uint8_t ise_weights[BLOCK_W * BLOCK_H];

	uint32_t src_pixel_index[2] = { 0, 0 };
	for (uint32_t y = 0; y < BLOCK_H; y++)
	{
		for (uint32_t x = 0; x < BLOCK_W; x++)
		{
			uint32_t part_index = (*pPat)[x + y * BLOCK_W];
			ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
			src_pixel_index[part_index]++;
		} // x
	} // y

	if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
	{
		// Full-resolution weight grid: emit the result directly.
		best_log_blk.m_partition_id = (uint16_t)p_seed;

		memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
		memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
		memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);

		res[0].m_valid = true;
		res[0].m_log_blk = best_log_blk;
	}
	else
	{
		// Reduced weight grid: dequantize, downsample, then requantize.
		uint8_t desired_weights[BLOCK_H * BLOCK_W];

		const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;

		for (uint32_t by = 0; by < BLOCK_H; by++)
			for (uint32_t bx = 0; bx < BLOCK_W; bx++)
				desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];

		uint8_t downsampled_weights[BLOCK_H * BLOCK_W];

		const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
		if (!pDownsample_matrix)
		{
			assert(0);
			return false;
		}

		downsample_weight_grid(
			pDownsample_matrix,
			BLOCK_W, BLOCK_H, // source/from dimension (block size)
			grid_w, grid_h, // dest/to dimension (grid size)
			desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
			downsampled_weights); // [wy][wx]

		best_log_blk.m_partition_id = (uint16_t)p_seed;
		memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
		memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);

		const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;

		for (uint32_t gy = 0; gy < grid_h; gy++)
			for (uint32_t gx = 0; gx < grid_w; gx++)
				best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];

		res[0].m_valid = true;
		res[0].m_log_blk = best_log_blk;

		// Optionally re-optimize each subset's endpoints against the
		// downsampled weight grid; any improvement is reported via res[1].
		if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
		{
			bool any_refined = false;

			for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
			{
				bool refine_status = refine_endpoints(
					cem,
					endpoints_ise_range,
					best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
					BLOCK_W, BLOCK_H, // block dimensions
					grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid
					part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
					&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
					coptions, mode11_opt_mode);

				if (refine_status)
					any_refined = true;
			}

			if (any_refined)
			{
				res[1].m_valid = true;
				res[1].m_log_blk = best_log_blk;
			}
		}
	}

	return true;
}
| |
// Maps a canonical 3-subset pattern to its (ASTC seed, unique index) pair.
typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;

// Canonical 3-subset partition patterns for 6x6 blocks (see init_partitions3_6x6()).
partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];
// Maps a 10-bit ASTC partition seed to its unique pattern index, or -1 if the
// seed's pattern was degenerate or a permuted duplicate.
int g_part3_seed_to_unique_index[1024];
// Vantage-point tree over g_partitions3 for fast nearest-pattern queries.
vp_tree g_part3_vp_tree;
| |
// Builds the unique 3-subset partition tables for 6x6 blocks. For each of the
// 1024 ASTC seeds it skips degenerate patterns (any empty subset) and
// patterns equivalent — under any of the NUM_PART3_MAPPINGS subset-label
// permutations — to one already recorded. The surviving patterns are checked
// against the precomputed g_part3_unique_index_to_seed table, stored in
// g_partitions3, and indexed by the VP-tree.
static void init_partitions3_6x6()
{
	uint32_t t = 0; // running count of unique patterns found so far

	// -1 marks seeds with no unique pattern (degenerate or duplicate).
	for (uint32_t i = 0; i < 1024; i++)
		g_part3_seed_to_unique_index[i] = -1;

	partition3_hash_map part3_hash;
	part3_hash.reserve(512);

	for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)
	{
		partition_pattern_vec p3;
		uint32_t part_hist[3] = { 0 };

		// Generate the 6x6 pattern and count texels per subset.
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);
				assert(p < 3);

				p3.m_parts[x + y * 6] = (uint8_t)p;
				part_hist[p]++;
			}
		}

		// Reject patterns where any subset is empty.
		if (!part_hist[0] || !part_hist[1] || !part_hist[2])
			continue;

		// Reject patterns already seen under any subset-label permutation.
		uint32_t j;
		for (j = 0; j < NUM_PART3_MAPPINGS; j++)
		{
			partition_pattern_vec temp_part3(p3.get_permuted3(j));

			if (part3_hash.contains(temp_part3))
				break;
		}
		if (j < NUM_PART3_MAPPINGS)
			continue;

		part3_hash.insert(p3, std::make_pair(seed_index, t) );

		// Must agree with the precomputed unique-index -> seed table.
		assert(g_part3_unique_index_to_seed[t] == seed_index);
		g_part3_seed_to_unique_index[seed_index] = t;
		g_partitions3[t] = p3;

		t++;
	}

	g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);
}
| |
// Estimates the most promising 3-subset partition patterns for a 6x6 HDR
// block. Runs a small k-means (3 clusters, 4 iterations) seeded with the
// darkest pixel, the brightest pixel, and the pixel jointly furthest from
// both, then matches the resulting cluster assignment against the canonical
// patterns (brute force over all label permutations, or via the VP-tree).
// Writes num_best_parts unique pattern indices to pBest_parts. Returns false
// when the clustering degenerates (duplicate seeds or an empty cluster).
static bool estimate_partition3_6x6(
	const basist::half_float pBlock_pixels_half[][3],
	int* pBest_parts, uint32_t num_best_parts)
{
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;

	assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));

	vec3F training_vecs[BLOCK_T], mean(0.0f);

	float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;
	vec3F cluster_centroids[NUM_SUBSETS];
	clear_obj(cluster_centroids);

	// Seed clusters 0/1 with the darkest and brightest pixels (by R+G+B sum).
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];

		v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);

		float inten = v.dot(vec3F(1.0f));
		if (inten < darkest_inten)
		{
			darkest_inten = inten;
			cluster_centroids[0] = v;
		}

		if (inten > brightest_inten)
		{
			brightest_inten = inten;
			cluster_centroids[1] = v;
		}
	}

	if (cluster_centroids[0] == cluster_centroids[1])
		return false;

	// Seed cluster 2 with the pixel maximizing its combined squared distance
	// to the first two seeds (skipping pixels identical to either seed).
	float furthest_dist2 = 0.0f;
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];

		float dist_a = v.squared_distance(cluster_centroids[0]);
		if (dist_a == 0.0f)
			continue;

		float dist_b = v.squared_distance(cluster_centroids[1]);
		if (dist_b == 0.0f)
			continue;

		float dist2 = dist_a + dist_b;
		if (dist2 > furthest_dist2)
		{
			furthest_dist2 = dist2;
			cluster_centroids[2] = v;
		}
	}

	if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
		return false;

	uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];
	uint32_t num_cluster_pixels[NUM_SUBSETS];
	vec3F new_cluster_means[NUM_SUBSETS];

	const uint32_t NUM_ITERS = 4;

	// Standard k-means: assign each pixel to its nearest centroid, then
	// recompute centroids; fail if any cluster empties out.
	for (uint32_t s = 0; s < NUM_ITERS; s++)
	{
		memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
		memset((void *)new_cluster_means, 0, sizeof(new_cluster_means));

		for (uint32_t i = 0; i < BLOCK_T; i++)
		{
			float d[NUM_SUBSETS] = {
				training_vecs[i].squared_distance(cluster_centroids[0]),
				training_vecs[i].squared_distance(cluster_centroids[1]),
				training_vecs[i].squared_distance(cluster_centroids[2]) };

			float min_d = d[0];
			uint32_t min_idx = 0;
			for (uint32_t j = 1; j < NUM_SUBSETS; j++)
			{
				if (d[j] < min_d)
				{
					min_d = d[j];
					min_idx = j;
				}
			}

			cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
			new_cluster_means[min_idx] += training_vecs[i];
			num_cluster_pixels[min_idx]++;
		} // i

		for (uint32_t j = 0; j < NUM_SUBSETS; j++)
		{
			if (!num_cluster_pixels[j])
				return false;

			cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
		}
	} // s

	// Convert the final cluster assignment into a per-texel pattern.
	partition_pattern_vec desired_part;
	for (uint32_t p = 0; p < NUM_SUBSETS; p++)
	{
		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
		{
			const uint32_t pix_index = cluster_pixels[p][i];
			desired_part[pix_index] = (uint8_t)p;
		}
	}

#if BRUTE_FORCE_PART_SEARCH
	// Subset labels are arbitrary, so compare against every permutation.
	partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];
	for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)
		desired_parts[j] = desired_part.get_permuted3(j);

	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];

	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)
	{
		const partition_pattern_vec& pat = g_partitions3[part_index];

		uint32_t lowest_pat_dist = UINT32_MAX;
		for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)
		{
			uint32_t dist = pat.get_squared_distance(desired_parts[p]);
			if (dist < lowest_pat_dist)
				lowest_pat_dist = dist;
		}

		// Pack (distance, index) so a plain sort orders by distance.
		part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;

	} // part_index;

	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);

	// Best (lowest distance) patterns are at the front of the sorted array.
	for (uint32_t i = 0; i < num_best_parts; i++)
		pBest_parts[i] = part_similarity[i] & 0xFFFF;
#else
	vp_tree::result_queue results;
	results.reserve(num_best_parts);
	g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);

	assert(results.get_size() == num_best_parts);

	const auto& elements = results.get_elements();

	// NOTE(review): element [0] is skipped — appears the queue's storage is
	// 1-based; confirm against vp_tree::result_queue's layout.
	for (uint32_t i = 0; i < results.get_size(); i++)
		pBest_parts[i] = elements[1 + i].m_pat_index;
#endif

	return true;
}
| |
// Encodes a 6x6 block using 3 partitions, trying each candidate partition
// pattern (pEst_patterns when provided, otherwise all unique patterns) and
// keeping the trial with the lowest error. Each subset is fit independently
// (CEM 7 or 11); when the weight grid is smaller than 6x6 the weights are
// downsampled, endpoints refined, and the trial re-scored by decoding the
// candidate block and measuring actual error.
// On success res holds the best trial and its error; returns res.m_valid.
static bool encode_block_3_subsets(
	trial_result& res,
	uint32_t cem,
	uint32_t grid_w, uint32_t grid_h,
	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
	astc_hdr_codec_base_options& coptions,
	bool uber_mode_flag,
	const int* pEst_patterns, int num_est_patterns,
	uint32_t comp_level,
	opt_mode_t mode11_opt_mode)
{
	BASISU_NOTE_UNUSED(uber_mode_flag);
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;
	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);

	res.m_valid = false;

	double best_e = BIG_FLOAT_VAL;

	astc_helpers::log_astc_block best_log_blk;
	clear_obj(best_log_blk);

	best_log_blk.m_num_partitions = NUM_SUBSETS;
	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
	best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;
	best_log_blk.m_grid_width = (uint8_t)grid_w;
	best_log_blk.m_grid_height = (uint8_t)grid_h;

	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;

	// Candidate set: caller-estimated patterns, or exhaustive when none given.
	const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;

	for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)
	{
		const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;
		assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);
		const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];

		// Scatter the block's pixels into per-subset arrays, remembering each
		// pixel's original block offset for later refinement.
		vec4F part_pixels_q16[NUM_SUBSETS][64];
		half_vec3 part_half_pixels[NUM_SUBSETS][64];
		uint8_t part_pixel_index[NUM_SUBSETS][64];
		uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];

				uint32_t l = part_total_pixels[part_index];

				part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
				part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
				part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);

				part_total_pixels[part_index] = l + 1;
			} // x
		} // y

		uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];
		uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];
		uint32_t best_submode[NUM_SUBSETS];

		// Fit endpoints/weights for each subset; accumulate total error.
		bool failed_flag = false;
		double e = 0.0f;
		for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
		{
			assert(part_total_pixels[part_iter]);

			double part_e;
			if (cem == 7)
			{
				part_e = encode_astc_hdr_block_mode_7(
					part_total_pixels[part_iter],
					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
					best_log_blk.m_weight_ise_range,
					best_submode[part_iter],
					BIG_FLOAT_VAL,
					blk_endpoints[part_iter],
					blk_weights[part_iter],
					coptions,
					best_log_blk.m_endpoint_ise_range);
			}
			else
			{
				assert(cem == 11);

				part_e = encode_astc_hdr_block_mode_11(
					part_total_pixels[part_iter],
					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
					best_log_blk.m_weight_ise_range,
					best_submode[part_iter],
					BIG_FLOAT_VAL,
					blk_endpoints[part_iter],
					blk_weights[part_iter],
					coptions,
					false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false,
					FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);
			}

			// BIG_FLOAT_VAL signals the subset could not be encoded at all.
			if (part_e == BIG_FLOAT_VAL)
			{
				failed_flag = true;
				break;
			}
			e += part_e;
		} // part_iter

		if (failed_flag)
			continue;

		// Gather the per-subset weights back into block (raster) order.
		uint8_t ise_weights[BLOCK_W * BLOCK_H];

		uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };
		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];

				ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
				src_pixel_index[part_index]++;
			} // x
		} // y

		if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
		{
			// Full-resolution weight grid: compare by the encoders' own error sum.
			if (e < best_e)
			{
				best_e = e;
				best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];

				for (uint32_t p = 0; p < NUM_SUBSETS; p++)
					memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);

				memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
			}
		}
		else
		{
			// Reduced weight grid: dequantize, downsample, then requantize.
			uint8_t desired_weights[BLOCK_H * BLOCK_W];

			const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;

			for (uint32_t by = 0; by < BLOCK_H; by++)
				for (uint32_t bx = 0; bx < BLOCK_W; bx++)
					desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];

			uint8_t downsampled_weights[BLOCK_H * BLOCK_W];

			const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
			if (!pDownsample_matrix)
			{
				assert(0);
				return false;
			}

			downsample_weight_grid(
				pDownsample_matrix,
				BLOCK_W, BLOCK_H, // source/from dimension (block size)
				grid_w, grid_h, // dest/to dimension (grid size)
				desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
				downsampled_weights); // [wy][wx]

			astc_helpers::log_astc_block trial_blk(best_log_blk);

			trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];

			for (uint32_t p = 0; p < NUM_SUBSETS; p++)
				memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);

			const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;

			for (uint32_t gy = 0; gy < grid_h; gy++)
				for (uint32_t gx = 0; gx < grid_w; gx++)
					trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];

			// Re-optimize each subset's endpoints against the downsampled grid.
			if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
			{
				for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
				{
					bool refine_status = refine_endpoints(
						cem,
						endpoints_ise_range,
						trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
						BLOCK_W, BLOCK_H, // block dimensions
						grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid
						part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
						&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
						coptions, mode11_opt_mode);

					BASISU_NOTE_UNUSED(refine_status);
				}
			}

			// Score the trial by actually decoding it and measuring block error.
			half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]
			bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);
			assert(status);
			if (!status)
				return false;

			half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];
			for (uint32_t y = 0; y < BLOCK_H; y++)
				for (uint32_t x = 0; x < BLOCK_W; x++)
					decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);

			double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);
			if (trial_err < best_e)
			{
				best_e = trial_err;
				best_log_blk = trial_blk;
			}
		}

	} // unique_p_iter

	if (best_e < BIG_FLOAT_VAL)
	{
		res.m_log_blk = best_log_blk;
		res.m_valid = true;
		res.m_err = best_e;
	}
	else
	{
		res.m_valid = false;
	}

	return res.m_valid;
}
| |
// ISE-encodes total_values values into the bitstream: the trit/quint groups
// are emitted first (trits packed 5 per 8-bit group since 3^5 = 243, quints
// 3 per 7-bit group since 5^3 = 125, with shortened final groups), followed
// by each value's raw low bits. Returns the total number of bits written.
static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)
{
	const uint32_t MAX_VALS = 64;
	uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3];
	uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1;

	assert((total_values) && (total_values <= MAX_VALS));

	// ISE range description: raw bit count plus trit/quint flags.
	const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];
	const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];
	const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];

	for (uint32_t i = 0; i < total_values; i++)
	{
		uint32_t val = pVals[i];

		// Split each value into its raw low bits and its trit/quint part.
		uint32_t bits = val & ((1 << ep_bits) - 1);
		uint32_t tq = val >> ep_bits;

		bit_values[i] = bits;

		if (ep_trits)
		{
			assert(tq < 3);
			// Pack 5 trits per group in base 3 (3^5 = 243 fits in 8 bits).
			tq_accum += tq * tq_mul;
			tq_mul *= 3;
			if (tq_mul == 243)
			{
				assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));
				tq_values[total_tq_values++] = tq_accum;
				tq_accum = 0;
				tq_mul = 1;
			}
		}
		else if (ep_quints)
		{
			assert(tq < 5);
			// Pack 3 quints per group in base 5 (5^3 = 125 fits in 7 bits).
			tq_accum += tq * tq_mul;
			tq_mul *= 5;
			if (tq_mul == 125)
			{
				assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));
				tq_values[total_tq_values++] = tq_accum;
				tq_accum = 0;
				tq_mul = 1;
			}
		}
	}

	uint32_t total_bits_output = 0;

	// Emit the full trit/quint groups.
	for (uint32_t i = 0; i < total_tq_values; i++)
	{
		const uint32_t num_bits = ep_trits ? 8 : 7;
		coder.put_bits(tq_values[i], num_bits);
		total_bits_output += num_bits;
	}

	// Emit any shortened final group, with just enough bits for the number of
	// trits/quints it actually holds.
	if (tq_mul > 1)
	{
		uint32_t num_bits;
		if (ep_trits)
		{
			if (tq_mul == 3)
				num_bits = 2;
			else if (tq_mul == 9)
				num_bits = 4;
			else if (tq_mul == 27)
				num_bits = 5;
			else //if (tq_mul == 81)
				num_bits = 7;
		}
		else
		{
			if (tq_mul == 5)
				num_bits = 3;
			else //if (tq_mul == 25)
				num_bits = 5;
		}
		coder.put_bits(tq_accum, num_bits);
		total_bits_output += num_bits;
	}

	// Finally emit each value's raw low bits, in order.
	for (uint32_t i = 0; i < total_values; i++)
	{
		coder.put_bits(bit_values[i], ep_bits);
		total_bits_output += ep_bits;
	}

	return total_bits_output;
}
| |
| static inline uint32_t get_num_endpoint_vals(uint32_t cem) |
| { |
| assert((cem == 7) || (cem == 11)); |
| return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; |
| } |
| |
// Serializes one encoded block to the bitstream: block mode, endpoint mode,
// then mode-dependent endpoint data (deltas for the left/upper-delta modes,
// or unique partition index plus raw ISE endpoints for cRaw), and finally the
// ISE-encoded weight grid.
static void code_block(bitwise_coder& coder,
	const astc_helpers::log_astc_block& log_blk,
	block_mode block_mode_index,
	endpoint_mode em, const uint8_t *pEP_deltas)
{
	coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);
	coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);

	const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);

	if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))
	{
		// Delta modes reuse a neighbor's endpoints; only 1-partition blocks qualify.
		assert(log_blk.m_num_partitions == 1);

		for (uint32_t i = 0; i < num_endpoint_vals; i++)
			coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);
	}
	else if (em == endpoint_mode::cRaw)
	{
		// For multi-partition blocks, encode the partition as its compact
		// unique index rather than the raw 10-bit ASTC seed.
		if (log_blk.m_num_partitions == 2)
		{
			const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];
			assert(unique_partition_index != -1);

			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);
		}
		else if (log_blk.m_num_partitions == 3)
		{
			const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];
			assert(unique_partition_index != -1);

			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);
		}

		encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);
	}

	// Dual-plane blocks carry two weights per grid sample.
	encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range);
}
| |
// Tuning parameters for the smooth-region detector, which scales up MSE error
// weights in visually smooth areas (where compression artifacts are most
// noticeable). Three detector scales are configured: 3x3, 7x7 and 11x11
// pixel neighborhoods, each with a max standard deviation threshold and the
// MSE scale applied when a region qualifies.
struct smooth_map_params
{
	bool m_no_mse_scaling; // disables all smooth-region MSE scaling

	// 3x3 neighborhood
	float m_max_smooth_std_dev;
	float m_smooth_max_mse_scale;

	// 7x7 neighborhood
	float m_max_med_smooth_std_dev;
	float m_med_smooth_max_mse_scale;

	// 11x11 neighborhood
	float m_max_ultra_smooth_std_dev;
	float m_ultra_smooth_max_mse_scale;

	bool m_debug_images; // emit debug visualizations of the smooth maps

	smooth_map_params()
	{
		clear();
	}

	// Resets every field to its default tuning.
	void clear()
	{
		m_no_mse_scaling = false;

		m_max_smooth_std_dev = 100.0f;
		m_smooth_max_mse_scale = 13000.0f;

		m_max_med_smooth_std_dev = 9.0f;
		m_med_smooth_max_mse_scale = 15000.0f;

		m_max_ultra_smooth_std_dev = 4.0f;
		// Earlier tunings for this scale ranged from 4500 up to 800000.
		m_ultra_smooth_max_mse_scale = 2000000.0f;

		m_debug_images = true;
	}
};
| |
// Per-destination-width resampler contributor lists for downsampling a 6-wide
// (or 6-tall) weight row to widths 1-6; index 0 is unused.
Resampler::Contrib_List* g_contrib_lists[7]; // 1-6

// Precomputes the 6 -> dst_width resampling contributor lists (clamped
// boundaries, scale 6/dst_width).
// NOTE(review): the live call pairs gaussian_filter with
// BASISU_BELL_FILTER_SUPPORT (the disabled line used the matching
// BASISU_GAUSSIAN_FILTER_SUPPORT) — possibly an intentional narrower support,
// but worth confirming.
static void init_contrib_lists()
{
	for (uint32_t dst_width = 1; dst_width <= 6; dst_width++)
		//g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
		g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
}
| |
// Disabled earlier variant of filter_block() that produced half-float and
// Q16 outputs directly (the live version below operates on vec4F). Kept for
// reference; separable 6x6 filtering using the precomputed contributor lists.
#if 0
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)
{
	vec3F temp_block[6][6]; // [y][x]

	// first filter rows to temp_block
	if (grid_x == 6)
	{
		memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				vec3F p(0.0f);

				// Weighted sum of this row's contributing source pixels.
				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
					p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * pRow_lists[x].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				temp_block[y][x] = p;
			} // x
		} // y
	}

	// filter columns
	if (grid_y == 6)
	{
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				// No vertical filtering needed; just convert to half/Q16.
				for (uint32_t c = 0; c < 3; c++)
				{
					const basist::half_float h = basist::float_to_half(temp_block[y][x][c]);

					pDst_block_half3[x + y * 6][c] = h;
					pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);
				}

				pDst_block_q16[x + y * 6][3] = 0.0f;
			} // x
		} // y
	}
	else
	{
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				vec3F p(0.0f);

				// Weighted sum of this column's contributing source pixels.
				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				for (uint32_t c = 0; c < 3; c++)
				{
					const basist::half_float h = basist::float_to_half(p[c]);

					pDst_block_half3[x + y * 6][c] = h;
					pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);
				}

				pDst_block_q16[x + y * 6][3] = 0.0f;

			} // x
		} // y
	}
}
#endif
| |
// Low-pass filters a 6x6 block of vec4F pixels (RGB only) to approximate the effect of
// a reduced grid_x x grid_y ASTC weight grid: rows are filtered into temp_block first,
// then columns into pDst_block, using the precomputed g_contrib_lists. A grid dimension
// of 6 means no filtering along that axis. Filtered RGB is clamped to
// [0, ASTC_HDR_MAX_VAL].
// NOTE(review): the grid_y == 6 path writes only components 0-2 and leaves
// pDst_block[...][3] unmodified, while the filtered path assigns a vec3F to the vec4F
// destination -- the final value of component 3 depends on vec4F's vec3F-assignment
// semantics; confirm callers don't rely on it.
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)
{
	vec4F temp_block[6][6]; // [y][x]

	// first filter rows to temp_block
	if (grid_x == 6)
	{
		// No horizontal filtering needed - straight copy.
		memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				vec3F p(0.0f);

				// Weighted sum of the contributing source pixels in this row.
				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
					p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				temp_block[y][x] = p;
			} // x
		} // y
	}

	// filter columns
	if (grid_y == 6)
	{
		// No vertical filtering - copy RGB through (component 3 left untouched).
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				for (uint32_t c = 0; c < 3; c++)
					pDst_block[x + y * 6][c] = temp_block[y][x][c];
			} // x
		} // y
	}
	else
	{
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				vec3F p(0.0f);

				// Weighted sum of the contributing row-filtered pixels in this column.
				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				pDst_block[x + y * 6] = p;

			} // y
		} // x
	}
}
| |
// vec3F overload of filter_block(): low-pass filters a 6x6 block to approximate a
// reduced grid_x x grid_y weight grid (rows first into temp_block, then columns into
// pDst_block). A grid dimension of 6 means no filtering along that axis.
// NOTE(review): unlike the vec4F overload, this variant does NOT clamp the filtered
// results to [0, ASTC_HDR_MAX_VAL] -- presumably intentional for its callers; confirm.
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)
{
	vec3F temp_block[6][6]; // [y][x]

	// first filter rows to temp_block
	if (grid_x == 6)
	{
		// No horizontal filtering needed - straight copy.
		memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				vec3F p(0.0f);

				// Weighted sum of the contributing source pixels in this row.
				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
					p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;

				temp_block[y][x] = p;
			} // x
		} // y
	}

	// filter columns
	if (grid_y == 6)
	{
		// No vertical filtering needed - straight copy.
		memcpy((void *)pDst_block, temp_block, sizeof(vec3F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				// Accumulate the weighted column sum directly into the destination.
				vec3F& p = pDst_block[x + y * 6];
				p.set(0.0f);

				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
			} // y
		} // x
	}
}
| |
| static float diff_blocks(const vec4F* pA, const vec4F* pB) |
| { |
| const uint32_t BLOCK_T = 36; |
| |
| float diff = 0.0f; |
| for (uint32_t i = 0; i < BLOCK_T; i++) |
| diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]); |
| |
| return diff * (1.0f / (float)BLOCK_T); |
| } |
| |
| static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB) |
| { |
| const uint32_t BLOCK_T = 36; |
| |
| vec3F mean(0.0f); |
| |
| for (uint32_t i = 0; i < BLOCK_T; i++) |
| { |
| vec3F diff(pA[i] - pB[i]); |
| mean += diff; |
| } |
| |
| mean *= (1.0f / (float)BLOCK_T); |
| |
| vec3F diff_sum(0.0f); |
| for (uint32_t i = 0; i < BLOCK_T; i++) |
| { |
| vec3F diff(pA[i] - pB[i]); |
| diff -= mean; |
| diff_sum += vec3F::component_mul(diff, diff); |
| } |
| |
| vec3F var(diff_sum * (1.0f / (float)BLOCK_T)); |
| |
| vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2])); |
| |
| return maximum(std_dev[0], std_dev[1], std_dev[2]); |
| } |
| |
// Builds a per-pixel MSE scale map biasing the encoder's error metric toward smooth
// regions, where compression artifacts are most visible. For each pixel, the maximum
// per-component std dev of three concentric neighborhoods is measured; the smoother
// the neighborhood, the larger the MSE scale (up to the params' per-tier maximums).
// If params.m_debug_images is set, grayscale visualizations are written to disk.
// pUltra_smooth_img, if provided, receives the ultra-smooth visualization image.
// NOTE(review): when m_no_mse_scaling is set the function returns early, leaving
// smooth_block_mse_scales unresized and pUltra_smooth_img unwritten (see TODO below;
// the early-out is described as a debug aid).
static void create_smooth_maps2(
	vector2D<float>& smooth_block_mse_scales,
	const image& orig_img,
	smooth_map_params& params, image* pUltra_smooth_img = nullptr)
{
	const uint32_t width = orig_img.get_width();
	const uint32_t height = orig_img.get_height();
	//const uint32_t total_pixels = orig_img.get_total_pixels();
	const uint32_t num_comps = 3;

	if (params.m_no_mse_scaling)
	{
		smooth_block_mse_scales.set_all(1.0f);
		return;
	}

	// TODO: - move up before the no mse scaling check (harmless as that is only a debug aid)
	smooth_block_mse_scales.resize(width, height);

	image smooth_vis, med_smooth_vis, ultra_smooth_vis;

	if (params.m_debug_images)
	{
		smooth_vis.resize(width, height);
		med_smooth_vis.resize(width, height);
		ultra_smooth_vis.resize(width, height);
	}

	for (uint32_t y = 0; y < height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			// Tier 1: 3x3 neighborhood.
			{
				// Only the first 3 entries (RGB) are used.
				tracked_stat_dbl comp_stats[4];
				for (int yd = -1; yd <= 1; yd++)
				{
					for (int xd = -1; xd <= 1; xd++)
					{
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);

						comp_stats[0].update((float)p[0]);
						comp_stats[1].update((float)p[1]);
						comp_stats[2].update((float)p[2]);
					}
				}

				float max_std_dev = 0.0f;
				for (uint32_t i = 0; i < num_comps; i++)
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());

				// yl: 0 = perfectly smooth, 1 = at/above the activity threshold.
				float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);
				//yl = powf(yl, 2.0f);
				yl = powf(yl, 1.0f / 2.0f); // substantially less bits

				// Smooth pixels get the maximum scale; busy pixels get 1.0.
				smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);

				if (params.m_debug_images)
				{
					//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));
					// white=high local activity (edges/detail)
					// black=low local activity (smooth - error is amplified)
					smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));
				}
			}

			// Tier 2: labeled "7x7" but the window below is [-3, 3), i.e. 6x6.
			{
				tracked_stat_dbl comp_stats[4];

				const int S = 3;
				// NOTE(review): yd/xd run from -S to S-1 (asymmetric window) -- possibly
				// intended to be <= S; confirm before changing, tuning depends on it.
				for (int yd = -S; yd < S; yd++)
				{
					for (int xd = -S; xd < S; xd++)
					{
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);

						comp_stats[0].update((float)p[0]);
						comp_stats[1].update((float)p[1]);
						comp_stats[2].update((float)p[2]);
					}
				}

				float max_std_dev = 0.0f;
				for (uint32_t i = 0; i < num_comps; i++)
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());

				float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);
				//yl = powf(yl, 2.0f);

				// Blend toward the medium-smooth scale; busy pixels keep the tier-1 scale.
				smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);

				if (params.m_debug_images)
					med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
			}

			// Tier 3: labeled "11x11" but the window below is [-5, 5), i.e. 10x10.
			{
				tracked_stat_dbl comp_stats[4];

				const int S = 5;
				// NOTE(review): same asymmetric [-S, S) window as tier 2 -- confirm.
				for (int yd = -S; yd < S; yd++)
				{
					for (int xd = -S; xd < S; xd++)
					{
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);

						comp_stats[0].update((float)p[0]);
						comp_stats[1].update((float)p[1]);
						comp_stats[2].update((float)p[2]);
					}
				}

				float max_std_dev = 0.0f;
				for (uint32_t i = 0; i < num_comps; i++)
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());

				float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);
				yl = powf(yl, 2.0f);

				// Blend toward the (very large) ultra-smooth scale.
				smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);

				if (params.m_debug_images)
					ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
			}

		}
	}

	if (params.m_debug_images)
	{
		save_png("dbg_smooth_vis.png", smooth_vis);
		save_png("dbg_med_smooth_vis.png", med_smooth_vis);
		save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);

		// Visualize the final combined scale map, normalized to its maximum.
		image vis_img(width, height);

		float max_scale = 0.0f;
		for (uint32_t y = 0; y < height; y++)
			for (uint32_t x = 0; x < width; x++)
				max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));

		for (uint32_t y = 0; y < height; y++)
			for (uint32_t x = 0; x < width; x++)
				vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));

		save_png("scale_vis.png", vis_img);
	}

	// NOTE(review): if m_debug_images is false this copies a never-resized (empty) image.
	if (pUltra_smooth_img)
		*pUltra_smooth_img = ultra_smooth_vis;
}
| |
// Intensity (I) level in ITP space below which a pixel is treated as "really dark".
const float REALLY_DARK_I_THRESHOLD = 0.0625f;
// MSE amplification applied to really dark pixels (see compute_pixel_mse_itp()).
const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;
// Delta-ITP (JND) amplification applied to really dark pixels (see compute_pixel_delta_itp()).
const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;
| |
| static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment) |
| { |
| float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0]; |
| float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1]; |
| float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2]; |
| |
| float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p); |
| |
| if (delta_itp_dark_adjustment) |
| { |
| // We have to process a large range of inputs, including extremely dark inputs. |
| // Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas. |
| // This is to better handle very dark signals which could be explictly overexposed. |
| float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]); |
| s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s); |
| err *= s; |
| } |
| |
| return err; |
| } |
| |
| static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment) |
| { |
| float total_mse = 0.0f; |
| |
| for (uint32_t y = 0; y < block_h; y++) |
| { |
| for (uint32_t x = 0; x < block_w; x++) |
| { |
| total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment); |
| } // x |
| } // y |
| |
| return total_mse * (1.0f / (float)(block_w * block_h)); |
| } |
| |
// Computes a modified SSIM between the original and compressed block pixels in ITP
// space. Each of the 3 components gets its own SSIM built from saturated
// luminance/contrast/structure terms; the final result is the product of the three
// component SSIMs. The luminance term is weighted more heavily (exponent 1.5) to
// reduce blocking.
static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)
{
	const uint32_t n = block_w * block_h;
	assert(n <= 36);

	stats<float> x_stats[3], y_stats[3];
	comparative_stats<float> xy_cov[3];

	// Per-component mean/variance (stride 3 floats = one vec3F per pixel).
	for (uint32_t c = 0; c < 3; c++)
	{
		x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);
		y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);
	}

	// Per-component covariance between original and compressed.
	for (uint32_t c = 0; c < 3; c++)
		xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]);

	float ssim[3];
	// d = dynamic range; k1/k2 are the standard SSIM stabilization constants.
	const double d = 1.0f, k1 = .01f, k2 = .03f;

	// weight mean error more highly to reduce blocking
	float ap = 1.5f, bp = 1.0f, cp = 1.0f;

	const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);
	const double s_c3(s_c2 * .5f);

	for (uint32_t c = 0; c < 3; c++)
	{
		// Luminance term.
		float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));
		lum = saturate(lum);

		// Contrast term.
		float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));
		con = saturate(con);

		// Structure term.
		float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));
		str = saturate(str);

		ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);
	}

	// Alternative combinations kept for experimentation; the product form is active.
#if 0
	float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);
#elif 1
	float final_ssim = ssim[0] * ssim[1] * ssim[2];
#else
	const float LP = .75f;
	float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);
#endif

	return final_ssim;
}
| |
| // delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light |
| static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment) |
| { |
| float delta_i = a[0] - b[0]; |
| float delta_t = a[1] - b[1]; |
| float delta_p = a[2] - b[2]; |
| |
| float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p)); |
| |
| float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]); |
| |
| if (delta_itp_dark_adjustment) |
| { |
| // This is to better handle very dark signals which could be explictly overexposed. |
| s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s); |
| err *= s; |
| } |
| |
| return err; |
| } |
| |
| struct candidate_encoding |
| { |
| encoding_type m_encoding_type; |
| |
| basist::half_float m_solid_color[3]; |
| |
| uint32_t m_run_len; |
| |
| vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x] |
| vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x] |
| |
| endpoint_mode m_endpoint_mode; |
| block_mode m_block_mode; |
| |
| bitwise_coder m_coder; |
| |
| // The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC. |
| // Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type. |
| astc_helpers::log_astc_block m_coded_log_blk; |
| |
| // The block the decoder outputs. |
| astc_helpers::log_astc_block m_decomp_log_blk; |
| |
| int m_reuse_delta_index; |
| |
| // m_t can get VERY large |
| double m_t, m_d; |
| float m_bits; |
| |
| candidate_encoding() |
| { |
| clear(); |
| } |
| |
| candidate_encoding(const candidate_encoding &other) |
| { |
| *this = other; |
| } |
| |
| candidate_encoding(candidate_encoding&& other) |
| { |
| *this = std::move(other); |
| } |
| |
| candidate_encoding& operator=(const candidate_encoding& rhs) |
| { |
| if (this == &rhs) |
| return *this; |
| |
| m_encoding_type = rhs.m_encoding_type; |
| memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color)); |
| m_run_len = rhs.m_run_len; |
| memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels)); |
| m_endpoint_mode = rhs.m_endpoint_mode; |
| m_block_mode = rhs.m_block_mode; |
| m_coder = rhs.m_coder; |
| m_coded_log_blk = rhs.m_coded_log_blk; |
| m_decomp_log_blk = rhs.m_decomp_log_blk; |
| m_reuse_delta_index = rhs.m_reuse_delta_index; |
| |
| return *this; |
| } |
| |
| candidate_encoding& operator=(candidate_encoding&& rhs) |
| { |
| if (this == &rhs) |
| return *this; |
| |
| m_encoding_type = rhs.m_encoding_type; |
| memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color)); |
| m_run_len = rhs.m_run_len; |
| memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels)); |
| m_endpoint_mode = rhs.m_endpoint_mode; |
| m_block_mode = rhs.m_block_mode; |
| m_coder = std::move(rhs.m_coder); |
| m_coded_log_blk = rhs.m_coded_log_blk; |
| m_decomp_log_blk = rhs.m_decomp_log_blk; |
| m_reuse_delta_index = rhs.m_reuse_delta_index; |
| |
| return *this; |
| } |
| |
| void clear() |
| { |
| m_encoding_type = encoding_type::cInvalid; |
| |
| clear_obj(m_solid_color); |
| |
| m_run_len = 0; |
| |
| clear_obj(m_comp_pixels); |
| |
| m_endpoint_mode = endpoint_mode::cInvalid; |
| m_block_mode = block_mode::cInvalid; |
| |
| m_coder.restart(); |
| |
| m_coded_log_blk.clear(); |
| m_decomp_log_blk.clear(); |
| |
| m_t = 0; |
| m_d = 0; |
| m_bits = 0; |
| |
| m_reuse_delta_index = 0; |
| } |
| }; |
| |
| bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels) |
| { |
| assert((block_w <= 6) && (block_h <= 6)); |
| |
| half_vec4 decoded_pixels_half4[6 * 6]; // [y][x] |
| bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16); |
| assert(status); |
| |
| if (!status) |
| return false; |
| |
| for (uint32_t y = 0; y < block_h; y++) |
| { |
| for (uint32_t x = 0; x < block_w; x++) |
| { |
| pPixels[x + y * block_w].set( |
| basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]), |
| basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]), |
| basist::half_to_float(decoded_pixels_half4[x + y * block_w][2])); |
| } // x |
| } //y |
| |
| return true; |
| } |
| |
| static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk) |
| { |
| astc_helpers::astc_block phys_blk; |
| return astc_helpers::pack_astc_block(phys_blk, decomp_blk); |
| } |
| |
// Debugging aid: when set to 1, a 16-bit 0xDEAD sync marker is expected before each
// coded block in the bitstream (see decode_file()).
#define SYNC_MARKERS (0)
| |
| static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height) |
| { |
| interval_timer tm; |
| tm.start(); |
| |
| const uint32_t BLOCK_W = 6, BLOCK_H = 6; |
| |
| width = 0; |
| height = 0; |
| |
| if (comp_data.size() <= 2*3) |
| return false; |
| |
| basist::bitwise_decoder decoder; |
| if (!decoder.init(comp_data.data(), comp_data.size_u32())) |
| return false; |
| |
| // Read initial LE marker |
| const uint32_t marker = decoder.get_bits(16); |
| |
| // Check for v1.60 and v2.0 markers - if it's not either, it's not valid data. |
| if ((marker != UASTC_6x6_HDR_SIG0) && (marker != UASTC_6x6_HDR_SIG1)) |
| return false; |
| |
| // Use original v1.60 behavior for tiny weight grid upsampling if it's the original marker, otherwise v2.0. |
| const bool use_orig_behavior = (marker == UASTC_6x6_HDR_SIG0); |
| |
| width = decoder.get_bits(16); |
| height = decoder.get_bits(16); |
| |
| if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM)) |
| return false; |
| |
| const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W; |
| const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H; |
| const uint32_t total_blocks = num_blocks_x * num_blocks_y; |
| |
| decoded_blocks.resize(num_blocks_x, num_blocks_y); |
| //memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes()); |
| |
| vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y); |
| //memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes()); |
| |
| uint32_t cur_bx = 0, cur_by = 0; |
| uint32_t step_counter = 0; |
| BASISU_NOTE_UNUSED(step_counter); |
| |
| while (cur_by < num_blocks_y) |
| { |
| step_counter++; |
| |
| //if ((cur_bx == 9) && (cur_by == 13)) |
| // printf("!"); |
| |
| #if SYNC_MARKERS |
| uint32_t mk = decoder.get_bits(16); |
| if (mk != 0xDEAD) |
| { |
| printf("!"); |
| assert(0); |
| return false; |
| } |
| #endif |
| if (decoder.get_bits_remaining() < 1) |
| return false; |
| |
| encoding_type et = encoding_type::cBlock; |
| |
| uint32_t b0 = decoder.get_bits(1); |
| if (!b0) |
| { |
| uint32_t b1 = decoder.get_bits(1); |
| if (b1) |
| et = encoding_type::cReuse; |
| else |
| { |
| uint32_t b2 = decoder.get_bits(1); |
| if (b2) |
| et = encoding_type::cSolid; |
| else |
| et = encoding_type::cRun; |
| } |
| } |
| |
| switch (et) |
| { |
| case encoding_type::cRun: |
| { |
| if (!cur_bx && !cur_by) |
| return false; |
| |
| const uint32_t run_len = decoder.decode_vlc(5) + 1; |
| |
| uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x); |
| if (run_len > num_blocks_remaining) |
| return false; |
| |
| uint32_t prev_bx = cur_bx, prev_by = cur_by; |
| |
| if (cur_bx) |
| prev_bx--; |
| else |
| { |
| prev_bx = num_blocks_x - 1; |
| prev_by--; |
| } |
| |
| const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by); |
| const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); |
| |
| for (uint32_t i = 0; i < run_len; i++) |
| { |
| decoded_log_blocks(cur_bx, cur_by) = prev_log_blk; |
| decoded_blocks(cur_bx, cur_by) = prev_phys_blk; |
| |
| cur_bx++; |
| if (cur_bx == num_blocks_x) |
| { |
| cur_bx = 0; |
| cur_by++; |
| } |
| } |
| |
| break; |
| } |
| case encoding_type::cSolid: |
| { |
| const basist::half_float rh = (basist::half_float)decoder.get_bits(15); |
| const basist::half_float gh = (basist::half_float)decoder.get_bits(15); |
| const basist::half_float bh = (basist::half_float)decoder.get_bits(15); |
| |
| astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); |
| |
| log_blk.clear(); |
| log_blk.m_solid_color_flag_hdr = true; |
| log_blk.m_solid_color[0] = rh; |
| log_blk.m_solid_color[1] = gh; |
| log_blk.m_solid_color[2] = bh; |
| log_blk.m_solid_color[3] = basist::float_to_half(1.0f); |
| |
| bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk); |
| if (!status) |
| return false; |
| |
| cur_bx++; |
| if (cur_bx == num_blocks_x) |
| { |
| cur_bx = 0; |
| cur_by++; |
| } |
| |
| break; |
| } |
| case encoding_type::cReuse: |
| { |
| if (!cur_bx && !cur_by) |
| return false; |
| |
| const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS); |
| |
| const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; |
| const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; |
| |
| const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y; |
| if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x)) |
| return false; |
| if (prev_by < 0) |
| return false; |
| |
| const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by); |
| const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); |
| |
| if (prev_log_blk.m_solid_color_flag_hdr) |
| return false; |
| |
| astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); |
| astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); |
| |
| log_blk = prev_log_blk; |
| |
| const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1); |
| |
| bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights); |
| if (!status) |
| return false; |
| |
| astc_helpers::log_astc_block decomp_blk; |
| status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H); |
| if (!status) |
| return false; |
| |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range); |
| |
| copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); |
| if (!status) |
| return false; |
| |
| cur_bx++; |
| if (cur_bx == num_blocks_x) |
| { |
| cur_bx = 0; |
| cur_by++; |
| } |
| |
| break; |
| } |
| case encoding_type::cBlock: |
| { |
| const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes); |
| const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal); |
| |
| switch (em) |
| { |
| case endpoint_mode::cUseLeft: |
| case endpoint_mode::cUseUpper: |
| { |
| int neighbor_bx = cur_bx, neighbor_by = cur_by; |
| |
| if (em == endpoint_mode::cUseLeft) |
| neighbor_bx--; |
| else |
| neighbor_by--; |
| |
| if ((neighbor_bx < 0) || (neighbor_by < 0)) |
| return false; |
| |
| const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by); |
| if (!neighbor_blk.m_color_endpoint_modes[0]) |
| return false; |
| |
| const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; |
| const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); |
| |
| if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) |
| return false; |
| |
| astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); |
| astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); |
| |
| log_blk.clear(); |
| log_blk.m_num_partitions = 1; |
| log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; |
| log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range; |
| log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; |
| log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; |
| log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; |
| log_blk.m_dual_plane = (uint8_t)bmd.m_dp; |
| log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; |
| |
| memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values); |
| |
| const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); |
| |
| bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); |
| if (!status) |
| return false; |
| |
| astc_helpers::log_astc_block decomp_blk; |
| decomp_blk.clear(); |
| |
| decomp_blk.m_num_partitions = 1; |
| decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; |
| decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; |
| decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; |
| decomp_blk.m_dual_plane = bmd.m_dp; |
| decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; |
| |
| basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); |
| |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); |
| |
| copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); |
| if (!status) |
| return false; |
| |
| cur_bx++; |
| if (cur_bx == num_blocks_x) |
| { |
| cur_bx = 0; |
| cur_by++; |
| } |
| |
| break; |
| } |
| case endpoint_mode::cUseLeftDelta: |
| case endpoint_mode::cUseUpperDelta: |
| { |
| int neighbor_bx = cur_bx, neighbor_by = cur_by; |
| |
| if (em == endpoint_mode::cUseLeftDelta) |
| neighbor_bx--; |
| else |
| neighbor_by--; |
| |
| if ((neighbor_bx < 0) || (neighbor_by < 0)) |
| return false; |
| |
| const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by); |
| if (!neighbor_blk.m_color_endpoint_modes[0]) |
| return false; |
| |
| const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; |
| const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); |
| |
| if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) |
| return false; |
| |
| astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); |
| astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); |
| |
| log_blk.clear(); |
| log_blk.m_num_partitions = 1; |
| log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; |
| log_blk.m_dual_plane = bmd.m_dp; |
| log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; |
| |
| log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; |
| basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints); |
| |
| const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; |
| const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1; |
| |
| const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank; |
| const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE; |
| const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range); |
| |
| for (uint32_t i = 0; i < num_endpoint_values; i++) |
| { |
| int cur_val = ise_to_rank[log_blk.m_endpoints[i]]; |
| |
| int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit; |
| |
| cur_val += delta; |
| if ((cur_val < 0) || (cur_val >= total_endpoint_levels)) |
| return false; |
| |
| log_blk.m_endpoints[i] = rank_to_ise[cur_val]; |
| } |
| |
| log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; |
| log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; |
| log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; |
| |
| const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); |
| |
| bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); |
| if (!status) |
| return false; |
| |
| astc_helpers::log_astc_block decomp_blk; |
| decomp_blk.clear(); |
| |
| decomp_blk.m_num_partitions = 1; |
| decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; |
| decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; |
| decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; |
| decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp; |
| decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; |
| |
| basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); |
| |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); |
| |
| copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); |
| if (!status) |
| return false; |
| |
| cur_bx++; |
| if (cur_bx == num_blocks_x) |
| { |
| cur_bx = 0; |
| cur_by++; |
| } |
| |
| break; |
| } |
| case endpoint_mode::cRaw: |
| { |
| const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; |
| |
| const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); |
| |
| astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); |
| astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); |
| |
| log_blk.clear(); |
| log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; |
| |
| for (uint32_t p = 0; p < bmd.m_num_partitions; p++) |
| log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; |
| |
| log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; |
| log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; |
| |
| log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; |
| log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; |
| log_blk.m_dual_plane = (uint8_t)bmd.m_dp; |
| log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; |
| |
| if (bmd.m_num_partitions == 2) |
| { |
| const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2); |
| log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index]; |
| } |
| else if (bmd.m_num_partitions == 3) |
| { |
| const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3); |
| log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index]; |
| } |
| |
| bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints); |
| if (!status) |
| return false; |
| |
| const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); |
| |
| status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); |
| if (!status) |
| return false; |
| |
| astc_helpers::log_astc_block decomp_blk; |
| decomp_blk.clear(); |
| |
| decomp_blk.m_dual_plane = bmd.m_dp; |
| decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; |
| decomp_blk.m_partition_id = log_blk.m_partition_id; |
| |
| decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; |
| |
| for (uint32_t p = 0; p < bmd.m_num_partitions; p++) |
| decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; |
| |
| decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; |
| decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; |
| |
| for (uint32_t p = 0; p < bmd.m_num_partitions; p++) |
| basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p); |
| |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); |
| |
| copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); |
| if (!status) |
| return false; |
| |
| cur_bx++; |
| if (cur_bx == num_blocks_x) |
| { |
| cur_bx = 0; |
| cur_by++; |
| } |
| |
| break; |
| } |
| default: |
| { |
| assert(0); |
| return false; |
| } |
| } |
| |
| break; |
| } |
| default: |
| { |
| assert(0); |
| return false; |
| } |
| } |
| } |
| |
| if (decoder.get_bits(16) != 0xA742) |
| { |
| fmt_error_printf("End marker not found!\n"); |
| return false; |
| } |
| |
| //fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs()); |
| |
| return true; |
| } |
| |
| static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels) |
| { |
| astc_helpers::log_astc_block log_blk; |
| if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height)) |
| return false; |
| |
| basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4]; |
| if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16)) |
| return false; |
| |
| const uint32_t total_block_pixels = block_width * block_height; |
| for (uint32_t p = 0; p < total_block_pixels; p++) |
| { |
| pPixels[p][0] = basist::half_to_float(half_block[p][0]); |
| pPixels[p][1] = basist::half_to_float(half_block[p][1]); |
| pPixels[p][2] = basist::half_to_float(half_block[p][2]); |
| pPixels[p][3] = basist::half_to_float(half_block[p][3]); |
| } |
| |
| return true; |
| } |
| |
| static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels) |
| { |
| return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height); |
| } |
| |
| static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params) |
| { |
| const uint32_t width = src_img.get_width(); |
| const uint32_t height = src_img.get_height(); |
| |
| if (pPacked_bc6h_img) |
| pPacked_bc6h_img->resize(width, height); |
| |
| interval_timer tm; |
| double total_enc_time = 0.0f; |
| BASISU_NOTE_UNUSED(total_enc_time); |
| |
| const uint32_t num_blocks_x = src_img.get_block_width(4); |
| const uint32_t num_blocks_y = src_img.get_block_height(4); |
| |
| bc6h_blocks.resize(num_blocks_x, num_blocks_y); |
| |
| for (uint32_t by = 0; by < num_blocks_y; by++) |
| { |
| for (uint32_t bx = 0; bx < num_blocks_x; bx++) |
| { |
| // Extract source image block |
| vec4F block_pixels[4][4]; // [y][x] |
| src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4); |
| |
| basist::half_float half_pixels[16 * 3]; // [y][x] |
| |
| for (uint32_t y = 0; y < 4; y++) |
| { |
| for (uint32_t x = 0; x < 4; x++) |
| { |
| for (uint32_t c = 0; c < 3; c++) |
| { |
| float v = block_pixels[y][x][c]; |
| |
| basist::half_float h = basist::float_to_half(v); |
| |
| half_pixels[(x + y * 4) * 3 + c] = h; |
| |
| } // c |
| |
| } // x |
| } // y |
| |
| basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by); |
| |
| tm.start(); |
| |
| basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params); |
| |
| total_enc_time += tm.get_elapsed_secs(); |
| |
| if (pPacked_bc6h_img) |
| { |
| basist::half_float unpacked_blk[16 * 3]; |
| bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false); |
| assert(status); |
| if (!status) |
| { |
| fmt_error_printf("unpack_bc6h() failed\n"); |
| return false; |
| } |
| |
| for (uint32_t y = 0; y < 4; y++) |
| { |
| for (uint32_t x = 0; x < 4; x++) |
| { |
| vec4F p; |
| |
| for (uint32_t c = 0; c < 3; c++) |
| { |
| float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]); |
| p[c] = v; |
| |
| } // c |
| |
| p[3] = 1.0f; |
| |
| pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p); |
| } // x |
| } // y |
| } |
| |
| } // bx |
| } // by |
| |
| //fmt_printf("Total BC6H encode time: {}\n", total_enc_time); |
| |
| return true; |
| } |
| |
| static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir) |
| { |
| vec3F q(p - line_org); |
| vec3F v(q - q.dot(line_dir) * line_dir); |
| return v.dot(v); |
| } |
| |
// Ranks candidate 2/3-subset partition patterns for both CEM 11 (mode 11, per-partition
// principal axis) and CEM 7 (mode 7, fixed grayscale axis) simultaneously.
// For each examined pattern it estimates, per partition, the residual variance left after
// projecting the partition's pixels onto the chosen axis; lower residual = better fit.
// Outputs the num_desired_pats best pattern indices for each mode.
static void estimate_partitions_mode7_and_11(
	uint32_t num_parts, // 2 or 3 partitions
	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
	const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats
	const astc_hdr_codec_base_options& coptions, // options
	uint32_t num_desired_pats,
	int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices
{
	BASISU_NOTE_UNUSED(coptions);
	BASISU_NOTE_UNUSED(num_unique_pats);

	const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6

	assert(num_parts <= MAX_PARTS);

	// Score/index pair, sorted ascending by total squared distance (lower is better).
	struct candidate_res
	{
		float m_total_sq_dist;
		uint32_t m_index;
		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
	};

	const uint32_t MAX_CANDIDATES = 1024;
	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));

	candidate_res mode11_candidates[MAX_CANDIDATES];
	candidate_res mode7_candidates[MAX_CANDIDATES];

	// Unit vector (1,1,1)/sqrt(3) — the axis mode 7 is constrained to.
	const vec3F grayscale_axis(0.5773502691f);

	for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
	{
		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
		assert(unique_part_index < num_unique_pats);

		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];

		// Pass 1: per-partition pixel means.
		vec3F part_means[MAX_PARTS];
		uint32_t part_total_texels[MAX_PARTS] = { 0 };

		for (uint32_t i = 0; i < num_parts; i++)
			part_means[i].clear();

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
				part_total_texels[part_index]++;

			} // x
		} // y

		for (uint32_t i = 0; i < num_parts; i++)
		{
			assert(part_total_texels[i]);
			part_means[i] /= (float)part_total_texels[i];
		}

		// Pass 2: per-partition 3x3 covariance (symmetric, stored as 6 unique elements:
		// [0]=rr [1]=rg [2]=rb [3]=gg [4]=gb [5]=bb).
		float part_cov[MAX_PARTS][6];
		memset(part_cov, 0, sizeof(part_cov));

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);

				const float r = p[0], g = p[1], b = p[2];

				part_cov[part_index][0] += r * r;
				part_cov[part_index][1] += r * g;
				part_cov[part_index][2] += r * b;
				part_cov[part_index][3] += g * g;
				part_cov[part_index][4] += g * b;
				part_cov[part_index][5] += b * b;

			} // x
		} // y

		// For each partition compute the total variance of all channels.
		// (The trace of the covariance matrix.)
		float total_variance[MAX_PARTS];
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
			total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];

		//vec3F part_axis[MAX_PARTS];
		float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
		float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis

		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
		{
			float* pCov = &part_cov[part_index][0];

			// Power iteration starting from a deliberately asymmetric seed vector,
			// so it doesn't start orthogonal to common principal axes.
			float xr = .9f, xg = 1.0f, xb = .7f;

			const uint32_t NUM_POWER_ITERS = 4;
			for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)
			{
				// Multiply the current estimate by the covariance matrix.
				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];

				// Renormalize by max-abs component to avoid overflow/underflow between iterations.
				float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));

				if (m >= 1e-10f)
				{
					m = 1.0f / m;

					r *= m;
					g *= m;
					b *= m;
				}

				xr = r;
				xg = g;
				xb = b;
			}

			float len_sq = xr * xr + xg * xg + xb * xb;

			if (len_sq < 1e-10f)
			{
				// Degenerate (near-zero variance) partition: fall back to the grayscale axis.
				xr = grayscale_axis[0];
				xg = grayscale_axis[0];
				xb = grayscale_axis[0];
			}
			else
			{
				// Normalize to unit length.
				len_sq = 1.0f / sqrtf(len_sq);

				xr *= len_sq;
				xg *= len_sq;
				xb *= len_sq;
			}

			{
				// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).
				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];

				// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.
				// The result is the variance along the principle axis.
				//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis
				//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb

				mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;
			}

			{
				const float yrgb = grayscale_axis[0];

				// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).
				float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];
				float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];
				float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];

				mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;
			}

		} // part_index

		// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.
		// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.
		// maximum(0, ...) guards against small negative values from float rounding.
		float mode11_total_sq_dist_to_line_alt = 0.0f;
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
		{
			float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);
			mode11_total_sq_dist_to_line_alt += d;
		}

		{
#if 0
			// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
			// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
			float total_sq_dist_to_line = 0.0f;
			for (uint32_t i = 0; i < BLOCK_T; i++)
			{
				const uint32_t part_index = (*pPat)[i];
				assert(part_index < num_parts);

				total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);
			}

			mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
#else
			mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;
#endif
			mode11_candidates[examine_iter].m_index = unique_part_index;
		}

		{
			// Same residual-variance scoring, but against the fixed grayscale axis (mode 7).
			float mode7_total_sq_dist_to_line_alt = 0.0f;
			for (uint32_t part_index = 0; part_index < num_parts; part_index++)
			{
				float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);
				mode7_total_sq_dist_to_line_alt += d;
			}

			mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;
			mode7_candidates[examine_iter].m_index = unique_part_index;
		}

	} // examine_iter

	// Sort ascending by residual error and emit the best num_desired_pats for each mode.
	std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);
	std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);

	for (uint32_t i = 0; i < num_desired_pats; i++)
		pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;

	for (uint32_t i = 0; i < num_desired_pats; i++)
		pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;
}
| |
// Ranks candidate 2/3-subset partition patterns for CEM 7 (mode 7) only.
// Unlike estimate_partitions_mode7_and_11(), this scores each pattern by the exact
// per-texel squared distance to a line through the partition mean along the fixed
// grayscale axis, rather than via a covariance/eigenvalue estimate.
// Outputs the num_desired_pats best pattern indices.
static void estimate_partitions_mode7(
	uint32_t num_parts, // 2 or 3 partitions
	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
	const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats
	const astc_hdr_codec_base_options& coptions, // options
	uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices
{
	BASISU_NOTE_UNUSED(coptions);
	BASISU_NOTE_UNUSED(num_unique_pats);

	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;
	assert(num_parts <= MAX_PARTS);

	// Score/index pair, sorted ascending by total squared distance (lower is better).
	struct candidate_res
	{
		float m_total_sq_dist;
		uint32_t m_index;
		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
	};

	const uint32_t MAX_CANDIDATES = 1024;
	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));

	candidate_res candidates[MAX_CANDIDATES];

	for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
	{
		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
		assert(unique_part_index < num_unique_pats);

		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];

		// Compute each partition's mean color.
		vec3F part_means[MAX_PARTS];
		uint32_t part_total_texels[MAX_PARTS] = { 0 };

		for (uint32_t i = 0; i < num_parts; i++)
			part_means[i].clear();

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
				part_total_texels[part_index]++;

			} // x
		} // y

		for (uint32_t i = 0; i < num_parts; i++)
		{
			assert(part_total_texels[i]);
			part_means[i] /= (float)part_total_texels[i];
		}

		// Mode 7's axis is fixed: the unit grayscale direction (1,1,1)/sqrt(3).
		vec3F part_axis(0.5773502691f);

		// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
		// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
		float total_sq_dist_to_line = 0.0f;
		for (uint32_t i = 0; i < BLOCK_T; i++)
		{
			const uint32_t part_index = (*pPat)[i];
			assert(part_index < num_parts);

			total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);
		}

		candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;

		candidates[examine_iter].m_index = unique_part_index;

	} // examine_iter

	// Sort ascending by residual error and emit the best num_desired_pats.
	std::sort(&candidates[0], &candidates[num_pats_to_examine]);

	for (uint32_t i = 0; i < num_desired_pats; i++)
		pDesired_pat_indices[i] = candidates[i].m_index;
}
| |
| static float calc_deblocking_penalty_itp( |
| uint32_t bx, uint32_t by, uint32_t width, uint32_t height, |
| const imagef& pass_src_img_itp, const candidate_encoding& candidate) |
| { |
| float total_deblock_penalty = 0.0f; |
| |
| float total_orig_mse = 0.0f, total_comp_mse = 0.0f; |
| uint32_t total_c = 0; |
| |
| for (uint32_t b = 0; b < 4; b++) |
| { |
| for (uint32_t i = 0; i < 6; i++) |
| { |
| int ox = 0, oy = 0, qx = 0, qy = 0; |
| |
| switch (b) |
| { |
| case 0: |
| ox = bx * 6 + i; oy = (by - 1) * 6 + 5; |
| qx = bx * 6 + i; qy = by * 6; |
| break; |
| case 1: |
| ox = bx * 6 + i; oy = (by + 1) * 6; |
| qx = bx * 6 + i; qy = by * 6 + 5; |
| break; |
| case 2: |
| ox = (bx - 1) * 6 + 5; oy = by * 6 + i; |
| qx = bx * 6; qy = by * 6 + i; |
| break; |
| case 3: |
| ox = (bx + 1) * 6; oy = by * 6 + i; |
| qx = bx * 6 + 5; qy = by * 6 + i; |
| break; |
| } |
| |
| if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height)) |
| continue; |
| |
| const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy); |
| const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy); |
| |
| const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block |
| |
| vec3F orig_delta_v(o_pixel_itp - q_pixel_itp); |
| total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]); |
| |
| vec3F d_delta_v(o_pixel_itp - d_pixel_itp); |
| total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]); |
| |
| total_c++; |
| } |
| } |
| |
| if (total_c) |
| { |
| total_orig_mse /= (float)total_c; |
| total_comp_mse /= (float)total_c; |
| |
| if (total_orig_mse) |
| { |
| total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse); |
| } |
| } |
| |
| return total_deblock_penalty; |
| } |
| |
| static bool calc_strip_size( |
| float lambda, |
| uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip, |
| uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg) |
| { |
| uint32_t total_strips = 1; |
| |
| if (lambda == 0.0f) |
| { |
| if (!force_one_strip) |
| { |
| total_strips = total_threads; |
| } |
| } |
| else |
| { |
| const uint32_t MIN_DESIRED_STRIPS = 8; |
| const uint32_t MAX_TARGET_STRIPS = 32; |
| const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12; |
| |
| if (!force_one_strip) |
| { |
| total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP); |
| |
| if (num_blocks_y >= MIN_DESIRED_STRIPS * 2) |
| total_strips = maximum(total_strips, MIN_DESIRED_STRIPS); |
| } |
| |
| total_strips = minimum(total_strips, MAX_TARGET_STRIPS); |
| } |
| |
| uint32_t rows_per_strip = 0; |
| if (total_strips <= 1) |
| { |
| rows_per_strip = num_blocks_y; |
| } |
| else |
| { |
| rows_per_strip = (num_blocks_y / total_strips) & ~1; |
| |
| if (rows_per_strip < 2) |
| rows_per_strip = 2;// num_blocks_y; |
| } |
| |
| assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0)); |
| |
| total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip; |
| |
| if (global_cfg.m_debug_output) |
| { |
| fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips); |
| fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip); |
| fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip); |
| } |
| |
| uint32_t total_rows = 0; |
| for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++) |
| { |
| uint32_t strip_first_by = strip_index * rows_per_strip; |
| uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y); |
| |
| if (strip_index == (total_strips - 1)) |
| strip_last_by = num_blocks_y - 1; |
| |
| uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1; |
| total_rows += num_strip_block_rows; |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows); |
| } |
| |
| if (total_rows != num_blocks_y) |
| { |
| fmt_error_printf("Strip calc failed\n"); |
| return false; |
| } |
| |
| res_total_strips = total_strips; |
| res_rows_per_strip = rows_per_strip; |
| |
| return true; |
| } |
| |
| static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg) |
| { |
| const uint32_t width = src_img.get_width(), height = src_img.get_height(); |
| |
| dst_img.resize(width, height); |
| |
| for (uint32_t y = 0; y < height; y++) |
| { |
| for (uint32_t x = 0; x < width; x++) |
| { |
| vec3F src_rgb(src_img(x, y)); |
| |
| vec3F src_itp; |
| linear_rgb_to_itp(src_rgb, src_itp, cfg); |
| |
| dst_img(x, y) = src_itp; |
| } |
| } |
| } |
| |
// ASTC 6x6 block dimensions used throughout the encoder.
const uint32_t BLOCK_W = 6, BLOCK_H = 6;
const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;

// Rate-distortion bit-cost penalties applied per encoding type
// (solid blocks, reuse-previous encodings, and run extensions).
const float SOLID_PENALTY = 4.0f;
const float REUSE_PENALTY = 1.0f;
const float RUN_PENALTY = 10.0f;

// Relative weights of MSE vs. SSIM in the candidate scoring metric,
// plus tuning thresholds/penalties. Values are empirically tuned — TODO confirm provenance.
const float MSE_WEIGHT = 300000.0f;
const float SSIM_WEIGHT = 200.0f;
const float TWO_LEVEL_PENALTY = 1.425f;
// Delta-SSIM thresholds for switching a block to the Gaussian-filtered source images.
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f;
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f;
// Extra MSE penalties for coarse weight grids on visually complex blocks
// (coarser grid = larger penalty).
const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;
const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;
const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;
| |
// Shared debug/statistics state for the 6x6 HDR encoder. Counters that are updated
// concurrently from worker threads are atomics; the visualization images and the
// non-atomic histograms are guarded by the mutexes below (updated under lock by callers).
struct uastc_hdr_6x6_debug_state
{
	// Histograms indexed by encoding type / endpoint mode / block mode.
	uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
	uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
	uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
	uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };

	// Per block mode, per color channel: per-use channel statistics and
	// cross-channel comparative statistics (used for the Pearson correlations in print()).
	basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
	basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];

	// Counts of blocks that fell back to the Gaussian-filtered source images,
	// horizontal filtering, detail-classification outcomes, and skipped mode-7 evaluations.
	std::atomic<uint32_t> m_total_gaussian1_blocks;
	std::atomic<uint32_t> m_total_gaussian2_blocks;
	std::atomic<uint32_t> m_total_filter_horizontal;
	std::atomic<uint32_t> m_detail_stats[5];
	std::atomic<uint32_t> m_total_mode7_skips;

	std::atomic<uint32_t> m_total_blocks_compressed;

	// Candidate-count tracking (total and per-block maximum; max via atomic_max()).
	std::atomic<uint32_t> m_total_candidates_considered;
	std::atomic<uint32_t> m_max_candidates_considered;

	// 2/3-partition and dual-plane usage counters.
	std::atomic<uint32_t> m_total_part2_stats[4];
	std::atomic<uint32_t> m_dp_stats[5];

	// Statistics for blocks encoded via endpoint reuse.
	std::atomic<uint32_t> m_reuse_num_parts[4];
	std::atomic<uint32_t> m_reuse_total_dp;

	// Debug visualization images (written by multiple threads under their mutexes).
	imagef m_stat_vis;
	std::mutex m_stat_vis_mutex;

	image m_part_vis;
	image m_mode_vis;
	image m_mode_vis2;
	image m_grid_vis;
	image m_enc_vis;
	std::mutex m_vis_image_mutex;

	// Histogram of the effective compression level chosen per block.
	std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];

	std::atomic<uint32_t> m_total_jnd_replacements;

	// Guards the non-atomic histograms/vectors above when updated from workers.
	std::mutex m_stats_mutex;

	uastc_hdr_6x6_debug_state()
	{
		// Pre-reserve the per-mode stat vectors to reduce reallocation churn during encoding.
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].reserve(512);
				m_block_mode_comparative_stats[i][j].reserve(512);
			}
		}
	}

	// Resets all statistics and sizes the visualization images to the source dimensions.
	// Must be called before each encode; not thread safe.
	void init(uint32_t width, uint32_t height)
	{
		m_stat_vis.resize(width, height);
		m_part_vis.resize(width, height);
		m_mode_vis.resize(width, height);
		m_mode_vis2.resize(width, height);
		m_grid_vis.resize(width, height);
		m_enc_vis.resize(width, height);

		basisu::clear_obj(m_encoding_type_hist);
		basisu::clear_obj(m_endpoint_mode_hist);
		basisu::clear_obj(m_block_mode_hist);
		basisu::clear_obj(m_block_mode_total_bits);

		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].clear();
				m_block_mode_comparative_stats[i][j].clear();
			}
		}

		// Atomics can't be memset; store 0 into each individually.
		m_total_gaussian1_blocks.store(0);
		m_total_gaussian2_blocks.store(0);
		m_total_filter_horizontal.store(0);
		for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
			m_detail_stats[i].store(0);
		m_total_mode7_skips.store(0);

		for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
			m_comp_level_hist[i].store(0);

		m_total_blocks_compressed.store(0);

		m_total_candidates_considered.store(0);
		m_max_candidates_considered.store(0);

		for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
			m_total_part2_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
			m_dp_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
			m_reuse_num_parts[i] .store(0);

		m_reuse_total_dp.store(0);

		m_total_jnd_replacements.store(0);
	}

	// Dumps all gathered statistics to stdout. Intended to be called once after
	// encoding completes; reads the counters without synchronization.
	void print(uint32_t total_blocks) const
	{
		fmt_printf("Total blocks: {}\n", total_blocks);
		fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
		// NOTE(review): prints exactly 5 histogram entries — assumes ASTC_HDR_6X6_MAX_COMP_LEVEL == 4; confirm if the constant changes.
		fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
		fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
		fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
		fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);

		fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
		fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);

		fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
		fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
		fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
		fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);

		fmt_printf("\nEncoding type histogram:\n");
		for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
			fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);

		fmt_printf("\nEndpoint mode histogram:\n");
		for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
			fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);

		fmt_printf("\nBlock mode histogram:\n");

		// Aggregate per-mode usage into dual/single-plane, CEM 7/11, and partition-count totals.
		uint32_t total_dp = 0, total_sp = 0;
		uint32_t total_mode11 = 0, total_mode7 = 0;
		uint32_t part_hist[3] = { 0 };
		uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
		uint32_t total_used_modes = 0;
		for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
		{
			const auto& bm_desc = g_block_mode_descs[i];

			const uint32_t total_uses = m_block_mode_hist[i];

			if (bm_desc.m_dp)
				total_dp += total_uses;
			else
				total_sp += total_uses;

			if (bm_desc.m_cem == 7)
				total_mode7 += total_uses;
			else
				total_mode11 += total_uses;

			part_hist[bm_desc.m_num_partitions - 1] += total_uses;

			if (bm_desc.m_num_partitions == 2)
			{
				if (bm_desc.m_cem == 7)
					part2_mode7_total += total_uses;
				else
				{
					assert(bm_desc.m_cem == 11);
					part2_mode11_total += total_uses;
				}
			}

			// Average the per-use max channel std dev and per-channel |Pearson| cross correlations.
			float avg_std_dev = 0.0f;
			float avg_cross_correlations[3] = { 0 };

			if (m_block_mode_comp_stats[i][0].size())
			{
				const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();

				for (uint32_t j = 0; j < num_uses; j++)
					avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
				avg_std_dev /= (float)num_uses;

				for (uint32_t j = 0; j < num_uses; j++)
				{
					avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
					avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
					avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
				}

				avg_cross_correlations[0] /= (float)num_uses;
				avg_cross_correlations[1] /= (float)num_uses;
				avg_cross_correlations[2] /= (float)num_uses;
			}

			fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
				bm_desc.m_cem,
				bm_desc.m_dp, bm_desc.m_dp_channel,
				bm_desc.m_num_partitions,
				bm_desc.m_grid_x, bm_desc.m_grid_y,
				astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
				astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
				total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
				avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);

			if (total_uses)
				total_used_modes++;
		}

		fmt_printf("Total used modes: {}\n", total_used_modes);

		fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
		fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
		fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
		fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
	}
};
| |
// Per-encode shared state: the source image in its various filtered/color-space
// forms, plus the output buffers the strip tasks write into.
struct uastc_hdr_6x6_encode_state
{
	// Base codec options; strip tasks derive their per-level options from this.
	astc_hdr_codec_base_options master_coptions;

	// Original linear RGB source image.
	imagef src_img;

	// Gaussian-filtered variants of the source (two strengths), used as alternate
	// encode sources for blocks where filtering improves results.
	imagef src_img_filtered1;
	imagef src_img_filtered2;

	// ITP (perceptual) color-space versions of the three images above.
	imagef src_img_itp;
	imagef src_img_filtered1_itp;
	imagef src_img_filtered2_itp;

	// Per-block MSE scale factors for smooth-block handling.
	vector2D<float> smooth_block_mse_scales;

	// Decoded/packed result image.
	imagef packed_img;

	// One coded bitstream per strip (strips are encoded independently, then concatenated).
	basisu::vector<bitwise_coder> strip_bits;

	// Final physical ASTC blocks for the whole image.
	basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;

	// The chosen candidate encoding for each block.
	vector2D<candidate_encoding> coded_blocks;
};
| |
| static bool compress_strip_task( |
| uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by, |
| uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height, |
| astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state) |
| { |
| BASISU_NOTE_UNUSED(num_blocks_y); |
| BASISU_NOTE_UNUSED(total_strips); |
| |
| vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x] |
| basisu::clear_obj(prev_comp_pixels); |
| |
| uint32_t prev_run_len = 0; |
| |
| bitwise_coder prev_encoding; |
| candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension |
| candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written |
| |
| bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index]; |
| |
| const uint32_t CANDIDATES_TO_RESERVE = 1536; |
| |
| basisu::vector<candidate_encoding> candidates; |
| candidates.reserve(CANDIDATES_TO_RESERVE); |
| |
| const bool use_orig_behavior = global_cfg.m_write_basisu_1_6_compatible_files; |
| |
| for (uint32_t by = strip_first_by; by <= strip_last_by; by++) |
| { |
| const bool has_upper_neighbor = by > strip_first_by; |
| |
| for (uint32_t bx = 0; bx < num_blocks_x; bx++) |
| { |
| //if ((bx == 1) && (by == 2)) |
| // basisu::fmt_printf("!"); |
| |
| for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++) |
| { |
| const bool has_left_neighbor = bx > 0; |
| //const bool has_prev = has_left_neighbor || has_upper_neighbor; |
| |
| // Select either the original source image, or the Gaussian filtered version. |
| // From here the encoder *must* use these 2 sources. |
| const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 : |
| ((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img); |
| |
| const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp : |
| ((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp); |
| |
| // Extract source image block |
| vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x] |
| pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); |
| |
| vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x] |
| pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); |
| |
| half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values |
| vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats |
| vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding |
| vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations |
| |
| bool is_grayscale = true; |
| |
| candidates.resize(0); |
| |
| float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f; |
| |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| vec3F rgb_input; |
| |
| for (uint32_t c = 0; c < 3; c++) |
| { |
| float v = block_pixels[y][x][c]; |
| |
| rgb_input[c] = v; |
| |
| const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v); |
| assert(h == basist::float_to_half(v)); |
| |
| half_pixels[y][x][c] = h; |
| |
| block_pixels_q16[y][x][c] = (float)half_to_qlog16(h); |
| |
| half_pixels_as_floats[y][x][c] = (float)h; |
| |
| } // c |
| |
| float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B)); |
| if (py < block_ly) |
| block_ly = py; |
| if (py > block_hy) |
| block_hy = py; |
| block_avg_y += py; |
| |
| //linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]); |
| |
| block_pixels_as_itp[y][x] = block_pixels_itp[y][x]; |
| |
| block_pixels_q16[y][x][3] = 0.0f; |
| |
| if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2])) |
| is_grayscale = false; |
| |
| } // x |
| } // y |
| |
| block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS); |
| |
| encode_astc_block_stats enc_block_stats; |
| enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]); |
| |
| vec4F x_filtered[6][6], y_filtered[6][6]; |
| |
| filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal) |
| filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically) |
| |
| const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered); |
| const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered); |
| const bool filter_horizontally = filtered_x_err < filtered_y_err; |
| |
| //const float block_mag_gradient_mag = block_max_gradient_mag(bx, by); |
| |
| if (filter_horizontally) |
| debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed); |
| |
| vec3F lowpass_filtered[6][6]; |
| filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]); |
| float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]); |
| |
| const bool very_detailed_block = lowpass_std_dev > 350.0f; |
| const bool very_blurry_block = lowpass_std_dev < 30.0f; |
| const bool super_blurry_block = lowpass_std_dev < 15.0f; |
| |
| basisu::stats<float> half_comp_stats[3]; |
| for (uint32_t c = 0; c < 3; c++) |
| half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3); |
| |
| const float SINGLE_PART_HALF_THRESH = 256.0f; |
| const float COMPLEX_HALF_THRESH = 1024.0f; |
| // HACK HACK |
| const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f; |
| |
| const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev); |
| |
| const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH); |
| const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH); |
| const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH); |
| |
| // Dynamically choose a comp_level for this block. |
| astc_hdr_codec_base_options coptions(enc_state.master_coptions); |
| uint32_t comp_level = global_cfg.m_master_comp_level; |
| |
| if (very_complex_block) |
| comp_level = global_cfg.m_highest_comp_level; |
| else if (complex_block) |
| comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2; |
| |
| debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed); |
| |
| bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false; |
| BASISU_NOTE_UNUSED(any_2subset_mode11_enabled); |
| |
| for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++) |
| { |
| if (comp_level == 0) |
| { |
| if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0) |
| continue; |
| } |
| else if (comp_level == 1) |
| { |
| if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0) |
| continue; |
| } |
| else if (comp_level == 2) |
| { |
| if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0) |
| continue; |
| } |
| |
| if (g_block_mode_descs[i].m_num_partitions == 2) |
| { |
| any_2subset_enabled = true; |
| |
| if (g_block_mode_descs[i].m_cem == 7) |
| { |
| any_2subset_mode7_enabled = true; |
| } |
| else |
| { |
| assert(g_block_mode_descs[i].m_cem == 11); |
| any_2subset_mode11_enabled = true; |
| } |
| } |
| else if (g_block_mode_descs[i].m_num_partitions == 3) |
| any_3subset_enabled = true; |
| } |
| |
| coptions.m_mode7_full_s_optimization = (comp_level >= 2); |
| |
| const bool uber_mode_flag = (comp_level >= 3); |
| coptions.m_allow_uber_mode = uber_mode_flag; |
| |
| coptions.m_ultra_quant = (comp_level >= 4); |
| |
| coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2); |
| coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2); |
| |
| coptions.m_disable_weight_plane_optimization = (comp_level >= 2); |
| |
| // ------------------- |
| |
| uint32_t total_used_block_chans = 0; |
| for (uint32_t i = 0; i < 3; i++) |
| total_used_block_chans += (half_comp_stats[i].m_range > 0.0f); |
| |
| const bool is_solid_block = (total_used_block_chans == 0); |
| |
| basisu::comparative_stats<float> half_cross_chan_stats[3]; |
| |
| // R vs. G |
| half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS, |
| &half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1], |
| 3, 3, |
| &half_comp_stats[0], &half_comp_stats[1]); |
| |
| // R vs. B |
| half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS, |
| &half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2], |
| 3, 3, |
| &half_comp_stats[0], &half_comp_stats[2]); |
| |
| // G vs. B |
| half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS, |
| &half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2], |
| 3, 3, |
| &half_comp_stats[1], &half_comp_stats[2]); |
| |
| const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson); |
| const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson); |
| const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson); |
| |
| float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL; |
| for (uint32_t i = 0; i < 3; i++) |
| { |
| #if 0 |
| // 9/5/2025, wrong metric, we're iterating channels pairs here, not individual channels. |
| // On 3 active channel blocks this causes no difference. |
| if (half_comp_stats[i].m_range > 0.0f) |
| #else |
| static const uint8_t s_chan_pairs[3][2] = { {0, 1}, {0, 2}, {1, 2} }; |
| |
| const uint32_t chanA = s_chan_pairs[i][0]; |
| const uint32_t chanB = s_chan_pairs[i][1]; |
| |
| if ((half_comp_stats[chanA].m_range > 0.0f) && (half_comp_stats[chanB].m_range > 0.0f)) |
| #endif |
| { |
| const float c = fabsf((float)half_cross_chan_stats[i].m_pearson); |
| min_corr = minimum(min_corr, c); |
| max_corr = maximum(max_corr, c); |
| } |
| } |
| |
| bool use_single_subset_mode7 = true; |
| if (comp_level <= 1) |
| { |
| // TODO: could also compute angle between principle axis and the grayscale axis. |
| // TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance |
| const float MODE7_MIN_CHAN_CORR = .5f; |
| const float MODE7_PCA_ANGLE_THRESH = .9f; |
| use_single_subset_mode7 = is_grayscale || is_solid_block || ((total_used_block_chans == 1) || (min_corr >= MODE7_MIN_CHAN_CORR)); |
| |
| if (use_single_subset_mode7) |
| { |
| float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f))); |
| if (cos_ang < MODE7_PCA_ANGLE_THRESH) |
| use_single_subset_mode7 = false; |
| } |
| } |
| |
| const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f); |
| |
| int desired_dp_chan = -1; |
| if (total_used_block_chans <= 1) |
| { |
| // no need for dual plane (except possibly 2x2 weight grids for RDO) |
| } |
| else |
| { |
| if (min_corr >= STRONG_CORR_THRESH) |
| { |
| // all channel pairs strongly correlated, no need for dual plane |
| debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed); |
| } |
| else |
| { |
| if (total_used_block_chans == 2) |
| { |
| if (half_comp_stats[0].m_range == 0.0f) |
| { |
| // r unused, check for strong gb correlation |
| if (gb_corr < STRONG_CORR_THRESH) |
| desired_dp_chan = 1; |
| } |
| else if (half_comp_stats[1].m_range == 0.0f) |
| { |
| // g unused, check for strong rb correlation |
| if (rb_corr < STRONG_CORR_THRESH) |
| desired_dp_chan = 0; |
| } |
| else |
| { |
| // b unused, check for strong rg correlation |
| if (rg_corr < STRONG_CORR_THRESH) |
| desired_dp_chan = 0; |
| } |
| } |
| else |
| { |
| assert(total_used_block_chans == 3); |
| |
| // see if rg/rb is weakly correlated vs. gb |
| if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) |
| desired_dp_chan = 0; |
| // see if gr/gb is weakly correlated vs. rb |
| else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) |
| desired_dp_chan = 1; |
| // assume b is weakest |
| else |
| desired_dp_chan = 2; |
| } |
| |
| if (desired_dp_chan == -1) |
| debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed); |
| else |
| debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed); |
| } |
| } |
| |
| // 2x2 is special for RDO at higher lambdas - always pick a preferred channel. |
| int desired_dp_chan_2x2 = 0; |
| if (total_used_block_chans == 2) |
| { |
| if (half_comp_stats[0].m_range == 0.0f) |
| desired_dp_chan_2x2 = 1; |
| } |
| else if (total_used_block_chans == 3) |
| { |
| // see if rg/rb is weakly correlated vs. gb |
| if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) |
| desired_dp_chan_2x2 = 0; |
| // see if gr/gb is weakly correlated vs. rb |
| else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) |
| desired_dp_chan_2x2 = 1; |
| // assume b is weakest |
| else |
| desired_dp_chan_2x2 = 2; |
| } |
| |
| // Gather all candidate encodings |
| bool status = false; |
| |
| // ---- Run candidate |
| if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor)) |
| { |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| |
| candidate.m_encoding_type = encoding_type::cRun; |
| |
| candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk; |
| candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk; |
| |
| memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels)); |
| |
| if (!prev_run_len) |
| { |
| candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN); |
| candidate.m_coder.put_vlc(0, 5); |
| } |
| else |
| { |
| // extend current run - compute the # of new bits needed for the extension. |
| |
| uint32_t prev_run_bits = prev_encoding.get_total_bits_u32(); |
| assert(prev_run_bits > 0); |
| |
| // We're not actually going to code this, because the previously emitted run code will be extended. |
| bitwise_coder temp_coder; |
| temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN); |
| temp_coder.put_vlc((prev_run_len + 1) - 1, 5); |
| |
| uint32_t cur_run_bits = temp_coder.get_total_bits_u32(); |
| assert(cur_run_bits >= prev_run_bits); |
| |
| uint32_t total_new_bits = cur_run_bits - prev_run_bits; |
| if (total_new_bits > 0) |
| candidate.m_coder.put_bits(0, total_new_bits); // dummy bits |
| } |
| |
| candidate.m_run_len = prev_run_len + 1; |
| |
| candidates.emplace_back(std::move(candidate)); |
| } |
| |
| // ---- Reuse candidate |
| if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f)) |
| { |
| for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++) |
| { |
| const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; |
| const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; |
| |
| const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y; |
| if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x)) |
| continue; |
| if (reuse_by < (int)strip_first_by) |
| break; |
| |
| const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by); |
| |
| // TODO - support this. |
| if (prev_candidate.m_encoding_type == encoding_type::cSolid) |
| continue; |
| assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse)); |
| |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; |
| astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk; |
| |
| const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk; |
| |
| const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height; |
| const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane; |
| const uint32_t num_grid_samples = grid_x * grid_y; |
| const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]); |
| |
| coded_log_blk = prev_candidate.m_coded_log_blk; |
| decomp_log_blk = prev_candidate.m_decomp_log_blk; |
| |
| if (prev_coded_log_blk.m_num_partitions == 1) |
| { |
| // Now encode the block using the transcoded endpoints |
| basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; |
| |
| if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7) |
| { |
| status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); |
| } |
| else |
| { |
| status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); |
| } |
| assert(status); |
| |
| uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H]; |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| |
| if (dual_plane) |
| { |
| eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector, |
| BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); |
| |
| downsample_ise_weights_dual_plane( |
| coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| trial_weights0, trial_weights1, coded_log_blk.m_weights); |
| |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); |
| } |
| else |
| { |
| eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); |
| |
| downsample_ise_weights( |
| coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| trial_weights0, coded_log_blk.m_weights); |
| |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); |
| } |
| |
| // Create the block the decoder would transcode into. |
| copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior); |
| } |
| else if (prev_coded_log_blk.m_num_partitions == 2) |
| { |
| assert(!dual_plane); |
| |
| const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id]; |
| assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2)); |
| |
| const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index]; |
| |
| vec4F part_pixels_q16[2][64]; |
| half_vec3 part_half_pixels[2][64]; |
| uint32_t part_total_pixels[2] = { 0 }; |
| |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| const uint32_t part_index = pat_vec[x + y * 6]; |
| |
| uint32_t l = part_total_pixels[part_index]; |
| |
| part_pixels_q16[part_index][l] = block_pixels_q16[y][x]; |
| part_half_pixels[part_index][l] = half_pixels[y][x]; |
| |
| part_total_pixels[part_index] = l + 1; |
| } // x |
| } // y |
| |
| uint8_t blk_weights[2][BLOCK_W * BLOCK_H]; |
| |
| for (uint32_t part_index = 0; part_index < 2; part_index++) |
| { |
| basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; |
| |
| if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7) |
| { |
| status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); |
| } |
| else |
| { |
| status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); |
| } |
| assert(status); |
| |
| eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range, |
| (basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); |
| |
| } // part_index |
| |
| uint8_t ise_weights[BLOCK_W * BLOCK_H]; |
| |
| uint32_t src_pixel_index[2] = { 0, 0 }; |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| const uint32_t part_index = pat_vec[x + y * 6]; |
| |
| ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; |
| src_pixel_index[part_index]++; |
| } // x |
| } // y |
| |
| downsample_ise_weights( |
| coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| ise_weights, coded_log_blk.m_weights); |
| |
| // Transcode these codable weights to ASTC weights. |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H]; |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); |
| |
| // Create the block the decoder would transcode into. |
| copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior); |
| } |
| else if (prev_coded_log_blk.m_num_partitions == 3) |
| { |
| assert(!dual_plane); |
| |
| const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id]; |
| assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3)); |
| |
| const partition_pattern_vec& pat = g_partitions3[unique_pat_index]; |
| |
| vec4F part_pixels_q16[3][64]; |
| half_vec3 part_half_pixels[3][64]; |
| uint32_t part_total_pixels[3] = { 0 }; |
| |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| const uint32_t part_index = pat.m_parts[x + y * BLOCK_W]; |
| |
| uint32_t l = part_total_pixels[part_index]; |
| |
| part_pixels_q16[part_index][l] = block_pixels_q16[y][x]; |
| part_half_pixels[part_index][l] = half_pixels[y][x]; |
| |
| part_total_pixels[part_index] = l + 1; |
| } // x |
| } // y |
| |
| uint8_t blk_weights[3][BLOCK_W * BLOCK_H]; |
| |
| for (uint32_t part_index = 0; part_index < 3; part_index++) |
| { |
| basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; |
| |
| status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); |
| assert(status); |
| |
| eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range, |
| (basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); |
| |
| } // part_index |
| |
| uint8_t ise_weights[BLOCK_W * BLOCK_H]; |
| |
| uint32_t src_pixel_index[3] = { 0 }; |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| const uint32_t part_index = pat.m_parts[x + y * BLOCK_W]; |
| |
| ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; |
| src_pixel_index[part_index]++; |
| } // x |
| } // y |
| |
| downsample_ise_weights( |
| coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| ise_weights, coded_log_blk.m_weights); |
| |
| // Transcode these codable weights to ASTC weights. |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H]; |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); |
| |
| // Create the block the decoder would transcode into. |
| copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior); |
| } |
| |
| if (!validate_log_blk(decomp_log_blk)) |
| { |
| fmt_error_printf("pack_astc_block() failed\n"); |
| return false; |
| } |
| |
| status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]); |
| if (!status) |
| { |
| fmt_error_printf("decode_astc_block() failed\n"); |
| return false; |
| } |
| |
| candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN); |
| candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS); |
| encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range); |
| |
| candidate.m_encoding_type = encoding_type::cReuse; |
| candidate.m_block_mode = prev_candidate.m_block_mode; |
| candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode; |
| candidate.m_reuse_delta_index = reuse_delta_index; |
| |
| candidates.emplace_back(std::move(candidate)); |
| |
| } // reuse_delta_index |
| } |
| |
| // ---- Solid candidate |
| if (global_cfg.m_use_solid_blocks) |
| { |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| |
| // solid |
| candidate.m_encoding_type = encoding_type::cSolid; |
| |
| float r = 0.0f, g = 0.0f, b = 0.0f; |
| const float LOG_BIAS = .125f; |
| bool solid_block = true; |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| if ((block_pixels[0][0][0] != block_pixels[y][x][0]) || |
| (block_pixels[0][0][1] != block_pixels[y][x][1]) || |
| (block_pixels[0][0][2] != block_pixels[y][x][2])) |
| { |
| solid_block = false; |
| } |
| |
| r += log2f(block_pixels[y][x][0] + LOG_BIAS); |
| g += log2f(block_pixels[y][x][1] + LOG_BIAS); |
| b += log2f(block_pixels[y][x][2] + LOG_BIAS); |
| } |
| } |
| |
| if (solid_block) |
| { |
| r = block_pixels[0][0][0]; |
| g = block_pixels[0][0][1]; |
| b = block_pixels[0][0][2]; |
| } |
| else |
| { |
| r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); |
| g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); |
| b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); |
| |
| r = minimum<float>(r, basist::MAX_HALF_FLOAT); |
| g = minimum<float>(g, basist::MAX_HALF_FLOAT); |
| b = minimum<float>(b, basist::MAX_HALF_FLOAT); |
| } |
| |
| basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b); |
| |
| candidate.m_solid_color[0] = rh; |
| candidate.m_solid_color[1] = gh; |
| candidate.m_solid_color[2] = bh; |
| |
| candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN); |
| |
| candidate.m_coder.put_bits(rh, 15); |
| candidate.m_coder.put_bits(gh, 15); |
| candidate.m_coder.put_bits(bh, 15); |
| |
| vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh)); |
| |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| candidate.m_comp_pixels[y][x] = cp; |
| |
| astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk; |
| |
| log_blk.clear(); |
| log_blk.m_solid_color_flag_hdr = true; |
| log_blk.m_solid_color[0] = rh; |
| log_blk.m_solid_color[1] = gh; |
| log_blk.m_solid_color[2] = bh; |
| log_blk.m_solid_color[3] = basist::float_to_half(1.0f); |
| |
| candidate.m_decomp_log_blk = log_blk; |
| |
| candidates.emplace_back(std::move(candidate)); |
| } |
| |
| if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks)) |
| { |
| static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 }; |
| static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 }; |
| |
| static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 }; |
| static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 }; |
| |
| static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 }; |
| static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 }; |
| |
| uint32_t total_parts2 = 0, total_parts3 = 0; |
| |
| assert(comp_level < 5); |
| if ((very_simple_block) && (comp_level <= 3)) |
| { |
| // Block's std dev is so low that 2-3 subsets are unlikely to help much |
| total_parts2 = 0; |
| total_parts3 = 0; |
| |
| debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed); |
| } |
| else if (very_complex_block) |
| { |
| total_parts2 = s_parts2_very_complex[comp_level]; |
| total_parts3 = s_parts3_very_complex[comp_level]; |
| |
| if (global_cfg.m_extra_patterns_flag) |
| { |
| total_parts2 += (comp_level == 4) ? 30 : 20; |
| total_parts3 += (comp_level == 4) ? 30 : 20; |
| } |
| |
| debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed); |
| } |
| else if (complex_block) |
| { |
| total_parts2 = s_parts2_complex[comp_level]; |
| total_parts3 = s_parts3_complex[comp_level]; |
| |
| if (global_cfg.m_extra_patterns_flag) |
| { |
| total_parts2 += (comp_level == 4) ? 15 : 10; |
| total_parts3 += (comp_level == 4) ? 15 : 10; |
| } |
| |
| debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed); |
| } |
| else |
| { |
| // moderate complexity - use defaults |
| total_parts2 = s_parts2_normal[comp_level]; |
| total_parts3 = s_parts3_normal[comp_level]; |
| |
| if (global_cfg.m_extra_patterns_flag) |
| { |
| total_parts2 += 5; |
| total_parts3 += 5; |
| } |
| |
| debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed); |
| } |
| |
| if (!any_2subset_enabled) |
| total_parts2 = 0; |
| |
| if (!any_3subset_enabled) |
| total_parts3 = 0; |
| |
| int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2]; |
| bool has_estimated_parts2 = false; |
| |
| if (total_parts2) |
| { |
| if (global_cfg.m_brute_force_partition_matching) |
| { |
| int candidate_pats2[NUM_UNIQUE_PARTITIONS2]; |
| for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++) |
| candidate_pats2[i] = i; |
| |
| if (any_2subset_enabled) |
| { |
| estimate_partitions_mode7_and_11( |
| 2, |
| NUM_UNIQUE_PARTITIONS2, g_partitions2, |
| NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2, |
| &half_pixels_as_floats[0][0], |
| coptions, |
| total_parts2, best_parts2_mode11, best_parts2_mode7); |
| } |
| |
| has_estimated_parts2 = true; |
| } |
| else |
| { |
| if (comp_level >= 1) |
| { |
| const uint32_t MAX_CANDIDATES2 = 48; |
| int candidate_pats2[MAX_CANDIDATES2 * 2]; |
| |
| uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2)); |
| num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2)); |
| |
| has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2); |
| |
| if (has_estimated_parts2) |
| { |
| estimate_partitions_mode7_and_11( |
| 2, |
| NUM_UNIQUE_PARTITIONS2, g_partitions2, |
| num_candidate_pats2, (uint32_t*)candidate_pats2, |
| &half_pixels_as_floats[0][0], |
| coptions, |
| total_parts2, best_parts2_mode11, best_parts2_mode7); |
| } |
| } |
| else |
| { |
| has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2); |
| |
| if ((has_estimated_parts2) && (any_2subset_mode7_enabled)) |
| memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0])); |
| } |
| } |
| } |
| |
| int best_parts3[NUM_UNIQUE_PARTITIONS3]; |
| bool has_estimated_parts3 = false; |
| |
| if (total_parts3) |
| { |
| #if 0 |
| has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3); |
| #elif 1 |
| if (global_cfg.m_brute_force_partition_matching) |
| { |
| int candidate_pats3[NUM_UNIQUE_PARTITIONS3]; |
| for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++) |
| candidate_pats3[i] = i; |
| |
| estimate_partitions_mode7( |
| 3, |
| NUM_UNIQUE_PARTITIONS3, g_partitions3, |
| NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3, |
| &half_pixels_as_floats[0][0], |
| coptions, |
| total_parts3, (uint32_t*)best_parts3); |
| |
| has_estimated_parts3 = true; |
| } |
| else |
| { |
| const uint32_t MAX_CANDIDATES3 = 48; |
| int candidate_pats3[MAX_CANDIDATES3 * 2]; |
| |
| uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2)); |
| num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3)); |
| |
| has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3); |
| |
| if (has_estimated_parts3) |
| { |
| estimate_partitions_mode7( |
| 3, |
| NUM_UNIQUE_PARTITIONS3, g_partitions3, |
| num_candidate_pats3, (uint32_t*)candidate_pats3, |
| &half_pixels_as_floats[0][0], |
| coptions, |
| total_parts3, (uint32_t*)best_parts3); |
| } |
| } |
| #endif |
| } |
| |
| const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares; |
| |
| // ---- Encoded block candidate |
| for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++) |
| { |
| const block_mode bm = (block_mode)block_mode_iter; |
| |
| if (comp_level == 0) |
| { |
| if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0) |
| continue; |
| } |
| else if (comp_level == 1) |
| { |
| if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0) |
| continue; |
| } |
| else if (comp_level == 2) |
| { |
| if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0) |
| continue; |
| } |
| |
| if (global_cfg.m_block_stat_optimizations_flag) |
| { |
| if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp)) |
| { |
| if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2)) |
| { |
| if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2) |
| continue; |
| } |
| else |
| { |
| if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan) |
| continue; |
| } |
| } |
| |
| if (comp_level <= 3) |
| { |
| const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x; |
| const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y; |
| |
| if (!g_block_mode_descs[block_mode_iter].m_dp) |
| { |
											// Minor gain (.5-1% fewer candidates)
| if (very_detailed_block) |
| { |
| if (grid_x * grid_y <= 12) |
| { |
| debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| } |
| |
| // Major gains (10-25% less candidates) |
| if (very_blurry_block) |
| { |
| if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1)) |
| { |
| debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| } |
| if (super_blurry_block) |
| { |
| if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1)) |
| { |
| debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| } |
| } |
| |
| if (grid_x != grid_y) |
| { |
| if (grid_x < grid_y) |
| { |
| if (!filter_horizontally) |
| { |
| debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| } |
| else |
| { |
| if (filter_horizontally) |
| { |
| debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| } |
| } |
| } |
| |
| if (global_cfg.m_lambda == 0.0f) |
| { |
| // Rarely useful if lambda=0 |
| if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2)) |
| continue; |
| } |
| } // block_stat_optimizations_flag |
| |
| if ((!use_single_subset_mode7) && |
| (g_block_mode_descs[block_mode_iter].m_cem == 7) && |
| (g_block_mode_descs[block_mode_iter].m_num_partitions == 1)) |
| { |
| debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| |
| for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++) |
| { |
| if (global_cfg.m_lambda == 0.0f) |
| { |
| // No use trying anything else |
| if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw) |
| continue; |
| } |
| |
| if (global_cfg.m_disable_delta_endpoint_usage) |
| { |
| if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta)) |
| continue; |
| } |
| |
| if (!global_cfg.m_favor_higher_compression) |
| { |
| if (comp_level == 0) |
| { |
| if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) |
| continue; |
| } |
| |
| if (comp_level <= 1) |
| { |
| if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper)) |
| continue; |
| } |
| } |
| |
| const endpoint_mode em = (endpoint_mode)endpoint_mode_iter; |
| |
| switch (em) |
| { |
| case endpoint_mode::cUseLeft: |
| case endpoint_mode::cUseUpper: |
| { |
| const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter]; |
| const uint32_t cem = local_md.m_cem; |
| |
| if (local_md.m_num_partitions > 1) |
| break; |
| |
| if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor)) |
| break; |
| else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor)) |
| break; |
| |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; |
| |
| int nx = bx, ny = by; |
| if (em == endpoint_mode::cUseLeft) |
| nx--; |
| else |
| ny--; |
| |
| const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny); |
| if (neighbor_blk.m_encoding_type == encoding_type::cSolid) |
| break; |
| assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse)); |
| |
| const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode]; |
| |
| if (neighbor_md.m_cem != cem) |
| break; |
| |
| assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem); |
| |
| const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y; |
| const bool dual_plane = local_md.m_dp; |
| const uint32_t num_grid_samples = grid_x * grid_y; |
| const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem); |
| |
| coded_log_blk.m_grid_width = (uint8_t)grid_x; |
| coded_log_blk.m_grid_height = (uint8_t)grid_y; |
| coded_log_blk.m_dual_plane = (uint8_t)dual_plane; |
| coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; |
| coded_log_blk.m_num_partitions = 1; |
| coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem; |
| coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range; |
| |
							// We're not explicitly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).
| coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range; |
| memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals); |
| |
| uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS]; |
| |
| // Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding. |
| basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, |
| neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints, |
| local_md.m_transcode_endpoint_ise_range, transcode_endpoints); |
| |
| // Now encode the block using the transcoded endpoints |
| basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; |
| |
| if (cem == 7) |
| { |
| status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range); |
| } |
| else |
| { |
| status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr, |
| astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range); |
| } |
| if (!status) |
| break; |
| |
| uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H]; |
| if (dual_plane) |
| { |
| eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); |
| |
| downsample_ise_weights_dual_plane( |
| local_md.m_weight_ise_range, local_md.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| trial_weights0, trial_weights1, coded_log_blk.m_weights); |
| } |
| else |
| { |
| eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); |
| |
| downsample_ise_weights( |
| local_md.m_weight_ise_range, local_md.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| trial_weights0, coded_log_blk.m_weights); |
| } |
| |
| // Transcode these codable weights to ASTC weights. |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range); |
| |
| // Create the block the decoder would transcode into. |
| astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; |
| decomp_blk.clear(); |
| |
| decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; |
| decomp_blk.m_dual_plane = local_md.m_dp; |
| decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; |
| decomp_blk.m_num_partitions = 1; |
| decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range; |
| decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range; |
| |
| memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals); |
| |
| copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| if (!validate_log_blk(decomp_blk)) |
| { |
| fmt_error_printf("pack_astc_block() failed\n"); |
| return false; |
| } |
| |
| status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); |
| if (!status) |
| { |
| fmt_error_printf("decode_astc_block() failed\n"); |
| return false; |
| } |
| |
| candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); |
| code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr); |
| |
| candidate.m_encoding_type = encoding_type::cBlock; |
| candidate.m_endpoint_mode = em; |
| candidate.m_block_mode = bm; |
| |
| candidates.emplace_back(std::move(candidate)); |
| |
| break; |
| } |
| case endpoint_mode::cUseLeftDelta: |
| case endpoint_mode::cUseUpperDelta: |
| { |
| const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter]; |
| const uint32_t cem = local_md.m_cem; |
| |
| if (local_md.m_num_partitions > 1) |
| break; |
| |
| if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor)) |
| break; |
| else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor)) |
| break; |
| |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; |
| |
| int nx = bx, ny = by; |
| if (em == endpoint_mode::cUseLeftDelta) |
| nx--; |
| else |
| ny--; |
| |
| const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny); |
| if (neighbor_blk.m_encoding_type == encoding_type::cSolid) |
| break; |
| assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse)); |
| |
| const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode]; |
| |
| if (neighbor_md.m_cem != cem) |
| break; |
| |
| assert(neighbor_md.m_cem == local_md.m_cem); |
| |
| const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y; |
| const bool dual_plane = local_md.m_dp; |
| const uint32_t num_grid_samples = grid_x * grid_y; |
| const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem); |
| |
| // Dequantize neighbor's endpoints to ISE 20 |
| uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS]; |
| basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, |
| neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints, |
| astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20); |
| |
| // Requantize neighbor's endpoints to our local desired coding ISE range |
| uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS]; |
| basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local); |
| |
| uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS]; |
| uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS]; |
| |
| // Now try to encode the current block using the neighbor's endpoints submode. |
| double err = 0.0f; |
| uint32_t best_submode = 0; |
| |
| if (cem == 7) |
| { |
| int maj_index, submode_index; |
| decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index); |
| |
| int first_submode = submode_index, last_submode = submode_index; |
| |
| err = encode_astc_hdr_block_mode_7( |
| NUM_BLOCK_PIXELS, |
| (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, |
| local_md.m_weight_ise_range, |
| best_submode, |
| BIG_FLOAT_VAL, |
| blk_endpoints, blk_weights0, |
| coptions, |
| local_md.m_endpoint_ise_range, |
| first_submode, last_submode, |
| &enc_block_stats); |
| } |
| else |
| { |
| int maj_index, submode_index; |
| decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index); |
| |
| int first_submode = -1, last_submode = -1; |
| if (maj_index == 3) |
| { |
| // direct |
| } |
| else |
| { |
| first_submode = submode_index; |
| last_submode = submode_index; |
| } |
| |
| if (dual_plane) |
| { |
| err = encode_astc_hdr_block_mode_11_dual_plane( |
| NUM_BLOCK_PIXELS, |
| (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, |
| local_md.m_dp_channel, |
| local_md.m_weight_ise_range, |
| best_submode, |
| BIG_FLOAT_VAL, |
| blk_endpoints, blk_weights0, blk_weights1, |
| coptions, |
| false, |
| local_md.m_endpoint_ise_range, |
| false, //uber_mode_flag, |
| false, |
| first_submode, last_submode, true); |
| } |
| else |
| { |
| err = encode_astc_hdr_block_mode_11( |
| NUM_BLOCK_PIXELS, |
| (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, |
| local_md.m_weight_ise_range, |
| best_submode, |
| BIG_FLOAT_VAL, |
| blk_endpoints, blk_weights0, |
| coptions, |
| false, |
| local_md.m_endpoint_ise_range, |
| false, //uber_mode_flag, |
| false, |
| first_submode, last_submode, true, |
| mode11_opt_mode, |
| &enc_block_stats); |
| } |
| } |
| |
| if (err == BIG_FLOAT_VAL) |
| break; |
| |
| uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS]; |
| |
| // TODO: For now, just try 5 bits for each endpoint. Can tune later. |
| // This isn't right, it's computing the deltas in ISE space. |
| //const uint32_t NUM_ENDPOINT_DELTA_BITS = 5; |
| const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; |
| const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1; |
| |
| const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank; |
| |
| bool all_deltas_in_limits = true; |
| for (uint32_t i = 0; i < num_endpoint_vals; i++) |
| { |
| int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]]; |
| |
| if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit)) |
| all_deltas_in_limits = false; |
| |
| endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit); |
| } |
| |
| if (all_deltas_in_limits) |
| { |
| coded_log_blk.m_grid_width = (uint8_t)grid_x; |
| coded_log_blk.m_grid_height = (uint8_t)grid_y; |
| coded_log_blk.m_dual_plane = (uint8_t)dual_plane; |
| coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; |
| coded_log_blk.m_num_partitions = 1; |
| coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; |
| coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range; |
| coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range; |
| |
| memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals); |
| |
| uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS]; |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| |
| basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints); |
| |
| if (dual_plane) |
| { |
| downsample_ise_weights_dual_plane( |
| local_md.m_weight_ise_range, local_md.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| blk_weights0, blk_weights1, |
| coded_log_blk.m_weights); |
| } |
| else |
| { |
| downsample_ise_weights( |
| local_md.m_weight_ise_range, local_md.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| blk_weights0, coded_log_blk.m_weights); |
| } |
| |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range); |
| |
| // Create the block the decoder would transcode into. |
| |
| astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; |
| decomp_blk.clear(); |
| |
| decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; |
| decomp_blk.m_dual_plane = local_md.m_dp; |
| decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; |
| decomp_blk.m_num_partitions = 1; |
| decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range; |
| decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range; |
| |
| memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals); |
| |
| copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| if (!validate_log_blk(decomp_blk)) |
| { |
| fmt_error_printf("pack_astc_block() failed\n"); |
| return false; |
| } |
| |
| status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); |
| if (!status) |
| { |
| fmt_error_printf("decode_astc_block() failed\n"); |
| return false; |
| } |
| |
| candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); |
| code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas); |
| |
| candidate.m_encoding_type = encoding_type::cBlock; |
| candidate.m_endpoint_mode = em; |
| candidate.m_block_mode = bm; |
| |
| candidates.emplace_back(std::move(candidate)); |
| } |
| |
| break; |
| } |
| case endpoint_mode::cRaw: |
| { |
| //if (candidates.size() == 339) |
| // fmt_printf("!"); |
| |
| const auto& mode_desc = g_block_mode_descs[(uint32_t)bm]; |
| const uint32_t cem = mode_desc.m_cem; |
| //const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem); |
| const bool dual_plane = mode_desc.m_dp; |
| |
| if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2)) |
| break; |
| |
| if (mode_desc.m_num_partitions == 3) |
| { |
| assert(!dual_plane); |
| |
| if (!has_estimated_parts3) |
| break; |
| |
| assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range); |
| assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range); |
| |
| trial_result res; |
| |
| status = encode_block_3_subsets( |
| res, |
| cem, |
| mode_desc.m_grid_x, mode_desc.m_grid_y, |
| mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, |
| &half_pixels[0][0], (vec4F*)block_pixels_q16, |
| coptions, |
| uber_mode_flag, |
| best_parts3, total_parts3, comp_level, mode11_opt_mode); |
| |
| if (!status) |
| break; |
| |
| assert(res.m_valid); |
| |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; |
| |
| coded_log_blk = res.m_log_blk; |
| |
| astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; |
| decomp_blk = res.m_log_blk; |
| |
| if (!validate_log_blk(decomp_blk)) |
| { |
| fmt_error_printf("pack_astc_block() failed\n"); |
| return false; |
| } |
| |
| status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); |
| if (!status) |
| { |
| fmt_error_printf("decode_astc_block() failed\n"); |
| return false; |
| } |
| |
| candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); |
| code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); |
| |
| candidate.m_encoding_type = encoding_type::cBlock; |
| candidate.m_endpoint_mode = em; |
| candidate.m_block_mode = bm; |
| |
| candidates.emplace_back(std::move(candidate)); |
| } |
| else if (mode_desc.m_num_partitions == 2) |
| { |
| assert(!dual_plane); |
| |
| if (!has_estimated_parts2) |
| break; |
| |
| assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range); |
| assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range); |
| |
| for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++) |
| { |
| trial_result results[2]; |
| |
| assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled)); |
| |
| status = encode_block_2_subsets( |
| results, |
| mode_desc.m_grid_x, mode_desc.m_grid_y, |
| mode_desc.m_cem, |
| mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, |
| &half_pixels[0][0], (vec4F*)block_pixels_q16, |
| coptions, |
| uber_mode_flag, |
| (cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter], |
| comp_level, |
| mode11_opt_mode, |
| true); |
| |
| if (!status) |
| continue; |
| |
| for (uint32_t r_iter = 0; r_iter < 2; r_iter++) |
| { |
| const trial_result& res = results[r_iter]; |
| |
| if (!res.m_valid) |
| continue; |
| |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; |
| |
| coded_log_blk = res.m_log_blk; |
| |
| astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; |
| decomp_blk = res.m_log_blk; |
| |
| if (!validate_log_blk(decomp_blk)) |
| { |
| fmt_error_printf("pack_astc_block() failed\n"); |
| return false; |
| } |
| |
| status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); |
| if (!status) |
| { |
| fmt_error_printf("decode_astc_block() failed\n"); |
| return false; |
| } |
| |
| candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); |
| code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); |
| |
| candidate.m_encoding_type = encoding_type::cBlock; |
| candidate.m_endpoint_mode = em; |
| candidate.m_block_mode = bm; |
| |
| candidates.emplace_back(std::move(candidate)); |
| |
| } // r_iter |
| } |
| } |
| else |
| { |
| // 1 subset |
| uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H]; |
| uint32_t best_submode = 0; |
| |
| candidate_encoding candidate; |
| candidate.m_coder.reserve(24); |
| astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; |
| |
| const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y; |
| const uint32_t num_grid_samples = grid_x * grid_y; |
| |
| const half_vec3* pBlock_pixels_half = &half_pixels[0][0]; |
| const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0]; |
| |
| const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1); |
| |
| uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; |
| |
| coded_log_blk.m_grid_width = (uint8_t)grid_x; |
| coded_log_blk.m_grid_height = (uint8_t)grid_y; |
| coded_log_blk.m_dual_plane = (uint8_t)dual_plane; |
| coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel; |
| coded_log_blk.m_num_partitions = 1; |
| coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem; |
| coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range; |
| coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range; |
| |
| if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H))) |
| { |
| double e = encode_astc_hdr_block_downsampled_mode_11( |
| BLOCK_W, BLOCK_H, grid_x, grid_y, |
| mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, |
| NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, |
| BIG_FLOAT_VAL, |
| FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode, |
| coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode, |
| coptions, |
| &enc_block_stats); |
| |
| if (e == BIG_FLOAT_VAL) |
| break; |
| } |
| else |
| { |
| if (cem == 7) |
| { |
| assert(!dual_plane); |
| |
| double e = encode_astc_hdr_block_mode_7( |
| NUM_BLOCK_PIXELS, |
| (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, |
| mode_desc.m_weight_ise_range, |
| best_submode, |
| BIG_FLOAT_VAL, |
| coded_log_blk.m_endpoints, |
| blk_weights0, |
| coptions, |
| mode_desc.m_endpoint_ise_range, |
| 0, MAX_MODE7_SUBMODE_INDEX, |
| &enc_block_stats); |
| BASISU_NOTE_UNUSED(e); |
| } |
| else |
| { |
| double e; |
| |
| if (dual_plane) |
| { |
| e = encode_astc_hdr_block_mode_11_dual_plane( |
| NUM_BLOCK_PIXELS, |
| (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, |
| mode_desc.m_dp_channel, |
| mode_desc.m_weight_ise_range, |
| best_submode, |
| BIG_FLOAT_VAL, |
| coded_log_blk.m_endpoints, |
| blk_weights0, blk_weights1, |
| coptions, |
| false, |
| mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false); |
| } |
| else |
| { |
| e = encode_astc_hdr_block_mode_11( |
| NUM_BLOCK_PIXELS, |
| (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, |
| mode_desc.m_weight_ise_range, |
| best_submode, |
| BIG_FLOAT_VAL, |
| coded_log_blk.m_endpoints, |
| blk_weights0, |
| coptions, |
| false, |
| mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false, |
| mode11_opt_mode, |
| &enc_block_stats); |
| } |
| |
| if (e == BIG_FLOAT_VAL) |
| break; |
| } |
| |
| if (dual_plane) |
| { |
| downsample_ise_weights_dual_plane( |
| mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| blk_weights0, blk_weights1, |
| coded_log_blk.m_weights); |
| } |
| else |
| { |
| downsample_ise_weights( |
| mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range, |
| BLOCK_W, BLOCK_H, |
| grid_x, grid_y, |
| blk_weights0, coded_log_blk.m_weights); |
| |
| if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H))) |
| { |
| bool refine_status = refine_endpoints(cem, |
| mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, |
| 6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y, |
| coded_log_blk.m_weights, mode_desc.m_weight_ise_range, |
| BLOCK_W * BLOCK_H, |
| (basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16, |
| nullptr, |
| coptions, mode11_opt_mode); |
| BASISU_NOTE_UNUSED(refine_status); |
| } |
| } |
| } |
| |
| basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range); |
| |
| // Create the block the decoder would transcode into. |
| astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; |
| decomp_blk.clear(); |
| |
| decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem; |
| decomp_blk.m_dual_plane = mode_desc.m_dp; |
| decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel; |
| decomp_blk.m_num_partitions = 1; |
| decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range; |
| decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range; |
| |
| basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); |
| |
| copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior); |
| |
| if (!validate_log_blk(decomp_blk)) |
| { |
| fmt_error_printf("pack_astc_block() failed\n"); |
| return false; |
| } |
| |
| status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); |
| if (!status) |
| { |
| fmt_error_printf("decode_astc_block() failed\n"); |
| return false; |
| } |
| |
| candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); |
| code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); |
| |
| candidate.m_encoding_type = encoding_type::cBlock; |
| candidate.m_endpoint_mode = em; |
| candidate.m_block_mode = bm; |
| |
| candidates.emplace_back(std::move(candidate)); |
| } |
| |
| break; |
| } |
| default: |
| assert(0); |
| fmt_debug_printf("Invalid endpoint mode\n"); |
| return false; |
| |
| } // switch (em) |
| |
| } // endpoint_mode_iter |
| |
| } // block_mode_iter |
| |
| } // is_solid_block |
| |
| //------------------------------------------------ |
| |
| debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed); |
| atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32()); |
| |
| for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) |
| { |
| auto& candidate = candidates[candidate_iter]; |
| |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg); |
| } |
| |
| // Find best overall candidate |
| double best_t = DBL_MAX; |
| int best_candidate_index = -1; |
| |
| float best_d_ssim = BIG_FLOAT_VAL; |
| |
| if (global_cfg.m_lambda == 0.0f) |
| { |
| for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) |
| { |
| const auto& candidate = candidates[candidate_iter]; |
| |
| float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]); |
| |
| if (candidate_d_ssim < best_d_ssim) |
| best_d_ssim = candidate_d_ssim; |
| |
| candidate_d_ssim *= SSIM_WEIGHT; |
| |
| float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment); |
| |
| candidate_mse += candidate_d_ssim; |
| |
| float total_deblock_penalty = 0.0f; |
| if (global_cfg.m_deblocking_flag) |
| { |
| total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight; |
| } |
| candidate_mse += total_deblock_penalty * SSIM_WEIGHT; |
| |
| if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse)) |
| { |
| // Bias the encoder away from 2 level blocks on complex blocks |
| // TODO: Perhaps only do this on large or non-interpolated grids |
| if (complex_block) |
| { |
| if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS) |
| { |
| candidate_mse *= TWO_LEVEL_PENALTY; |
| } |
| } |
| |
| // Bias the encoder away from smaller weight grids if the block is very complex |
| // TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling. |
| if (complex_block) |
| { |
| if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2)) |
| candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY; |
| else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3) |
| candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY; |
| else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4) |
| candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY; |
| } |
| } |
| |
| float candidate_t = candidate_mse; |
| |
| if (candidate_t < best_t) |
| { |
| best_t = candidate_t; |
| best_candidate_index = candidate_iter; |
| } |
| |
| } // candidate_iter |
| |
| if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM)) |
| { |
| debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| |
| const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f); |
| |
| if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) && |
| (block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) && |
| (block_avg_y >= 1.5f)) |
| { |
| debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| } |
| else |
| { |
| assert(enc_state.smooth_block_mse_scales.get_width() > 0); |
| |
| // Compute block's perceptual weighting |
| float perceptual_scale = 0.0f; |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y)); |
| |
| // Very roughly normalize the computed distortion vs. bits. |
| perceptual_scale *= 10.0f; |
| |
| for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) |
| { |
| auto& candidate = candidates[candidate_iter]; |
| |
| float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]); |
| |
| if (d_ssim < best_d_ssim) |
| best_d_ssim = (float)d_ssim; |
| |
| d_ssim *= SSIM_WEIGHT; |
| |
| float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment); |
| |
| candidate_mse += d_ssim; |
| |
| float total_deblock_penalty = 0.0f; |
| if (global_cfg.m_deblocking_flag) |
| { |
| total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight; |
| } |
| candidate_mse += total_deblock_penalty * SSIM_WEIGHT; |
| |
| if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse)) |
| { |
| // Bias the encoder away from 2 level blocks on complex blocks |
| if (complex_block) |
| { |
| if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS) |
| { |
| candidate_mse *= TWO_LEVEL_PENALTY; |
| } |
| } |
| |
| // Bias the encoder away from smaller weight grids if the block is very complex |
| if (complex_block) |
| { |
| if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2)) |
| candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY; |
| else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3) |
| candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY; |
| else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4) |
| candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY; |
| } |
| } |
| |
| float mode_penalty = 1.0f; |
| if (candidate.m_encoding_type == encoding_type::cSolid) |
| mode_penalty *= SOLID_PENALTY; |
| else if (candidate.m_encoding_type == encoding_type::cReuse) |
| mode_penalty *= REUSE_PENALTY; |
| else if (candidate.m_encoding_type == encoding_type::cRun) |
| mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY); |
| |
| float candidate_bits = (float)candidate.m_coder.get_total_bits(); |
| |
| double candidate_d = (double)candidate_mse * mode_penalty; |
| |
| const float D_POWER = 2.0f; |
| |
| // this value can get VERY large after squaring on random (fuzzed) HDR inputs |
| double candidate_t = perceptual_scale * pow(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f); |
| |
| candidate.m_t = candidate_t; |
| candidate.m_d = candidate_d; |
| candidate.m_bits = candidate_bits; |
| |
| if (candidate_t < best_t) |
| { |
| best_t = candidate_t; |
| best_candidate_index = candidate_iter; |
| } |
| |
| } // candidate_iter |
| |
| if (best_candidate_index < 0) |
| { |
| assert(0); |
| |
| // Should never happen |
| best_candidate_index = 0; |
| } |
| |
| if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM)) |
| { |
| debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| |
| const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f); |
| |
| if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) && |
| (block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) && |
| (block_avg_y >= 1.5f)) |
| { |
| debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed); |
| continue; |
| } |
| |
| if (global_cfg.m_rdo_candidate_diversity_boost) |
| { |
| // candidate diversity boosting - consider candidates along/near the Pareto front |
| const candidate_encoding& comp_candidate = candidates[best_candidate_index]; |
| |
| double best_d = DBL_MAX; |
| |
| for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) |
| { |
| const auto& candidate = candidates[candidate_iter]; |
| |
| if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight) |
| { |
| if (candidate.m_d < best_d) |
| { |
| best_d = candidate.m_d; |
| best_candidate_index = candidate_iter; |
| } |
| } |
| } |
| } |
| |
| // candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that |
| if (global_cfg.m_jnd_optimization) |
| { |
| const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index]; |
| |
| float new_best_candidate_bits = BIG_FLOAT_VAL; |
| int new_best_candidate_index = -1; |
| |
| for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) |
| { |
| if ((int)candidate_iter == best_candidate_index) |
| continue; |
| |
| const auto& candidate = candidates[candidate_iter]; |
| |
| if (candidate.m_bits >= cur_comp_candidate.m_bits) |
| continue; |
| |
| float max_delta_itp = 0.0f; |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| { |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| { |
| float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment); |
| max_delta_itp = maximum(max_delta_itp, delta_itp); |
| |
| if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh) |
| goto skip; |
| } |
| } |
| |
| skip: |
| if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh) |
| continue; |
| |
| if (candidate.m_bits < new_best_candidate_bits) |
| { |
| new_best_candidate_bits = candidate.m_bits; |
| new_best_candidate_index = candidate_iter; |
| } |
| } |
| |
| if (new_best_candidate_index != -1) |
| { |
| best_candidate_index = new_best_candidate_index; |
| debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed); |
| } |
| } |
| |
| } // if (lambda == 0.0f) |
| |
| if (global_cfg.m_debug_images) |
| { |
| std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex); |
| debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f)); |
| } |
| |
| if (best_candidate_index < 0) |
| { |
| assert(best_candidate_index >= 0); |
| fmt_error_printf("No candidates!\n"); |
| return false; |
| } |
| |
| const auto& best_candidate = candidates[best_candidate_index]; |
| |
| assert(best_candidate.m_encoding_type != encoding_type::cInvalid); |
| |
| if (best_candidate.m_encoding_type == encoding_type::cRun) |
| { |
| if (!prev_run_len) |
| { |
| if (prev_encoding.get_total_bits()) |
| { |
| #if SYNC_MARKERS |
| strip_coded_bits.put_bits(0xDEAD, 16); |
| #endif |
| |
| strip_coded_bits.append(prev_encoding); |
| } |
| |
| assert(best_candidate.m_coder.get_total_bits()); |
| |
| prev_encoding = best_candidate.m_coder; |
| |
| prev_run_len = 1; |
| } |
| else |
| { |
| prev_run_len++; |
| |
| const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32(); |
| assert(prev_run_bits); |
| BASISU_NOTE_UNUSED(prev_run_bits); |
| |
| const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32(); |
| BASISU_NOTE_UNUSED(num_dummy_bits); |
| |
| // Rewrite the previous encoding to extend the run length. |
| prev_encoding.restart(); |
| prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN); |
| prev_encoding.put_vlc(prev_run_len - 1, 5); |
| |
| assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits); |
| } |
| } |
| else |
| { |
| if (prev_encoding.get_total_bits()) |
| { |
| #if SYNC_MARKERS |
| strip_coded_bits.put_bits(0xDEAD, 16); |
| #endif |
| |
| strip_coded_bits.append(prev_encoding); |
| } |
| |
| prev_encoding = best_candidate.m_coder; |
| prev_run_len = 0; |
| } |
| |
| memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H); |
| |
| prev_candidate_encoding = best_candidate; |
| |
| if (best_candidate.m_encoding_type != encoding_type::cRun) |
| prev_non_run_candidate_encoding = best_candidate; |
| |
| { |
| std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex); |
| |
| debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++; |
| |
| if (best_candidate.m_encoding_type == encoding_type::cBlock) |
| { |
| debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++; |
| } |
| |
| if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock)) |
| { |
| const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode; |
| assert(bm_index < (uint32_t)block_mode::cBMTotalModes); |
| |
| debug_state.m_block_mode_hist[bm_index]++; |
| debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits(); |
| |
| for (uint32_t i = 0; i < 3; i++) |
| { |
| debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]); |
| debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]); |
| } |
| } |
| |
| if (best_candidate.m_encoding_type == encoding_type::cReuse) |
| { |
| debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed); |
| |
| if (best_candidate.m_coded_log_blk.m_dual_plane) |
| debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed); |
| } |
| } |
| |
| enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding; |
| |
| // Update decoded image |
| vec4F decoded_float_pixels[BLOCK_H][BLOCK_W]; |
| for (uint32_t y = 0; y < BLOCK_H; y++) |
| for (uint32_t x = 0; x < BLOCK_W; x++) |
| decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x]; |
| |
| enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); |
| |
| status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr); |
| if (!status) |
| { |
| fmt_error_printf("Failed packing block\n"); |
| return false; |
| } |
| |
| const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed); |
| if ((r & 2047) == 2047) |
| { |
| if (global_cfg.m_status_output) |
| { |
| basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks); |
| } |
| } |
| |
| if ((global_cfg.m_debug_images) && |
| ((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid))) |
| { |
| std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex); |
| |
| if (best_candidate.m_decomp_log_blk.m_num_partitions == 2) |
| { |
| const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id]; |
| assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2)); |
| |
| const partition_pattern_vec& pat = g_partitions2[part2_unique_index]; |
| |
| for (uint32_t y = 0; y < 6; y++) |
| { |
| for (uint32_t x = 0; x < 6; x++) |
| { |
| const uint32_t p = pat[x + y * 6]; |
| debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255)); |
| } // x |
| } // y |
| } |
| else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3) |
| { |
| //part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255)); |
| |
| const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id]; |
| assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3)); |
| |
| const partition_pattern_vec& pat = g_partitions3[part3_unique_index]; |
| |
| for (uint32_t y = 0; y < 6; y++) |
| { |
| for (uint32_t x = 0; x < 6; x++) |
| { |
| const uint32_t p = pat[x + y * 6]; |
| color_rgba c(0, 0, 150, 255); |
| if (p == 1) |
| c.set(100, 0, 150, 255); |
| else if (p == 2) |
| c.set(0, 100, 150, 255); |
| debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c); |
| } // x |
| } // y |
| } |
| else if (best_candidate.m_decomp_log_blk.m_dual_plane) |
| { |
| debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255)); |
| } |
| else |
| { |
| debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255)); |
| } |
| |
| color_rgba c; |
| c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36); |
| debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c); |
| |
| c.set(0, 0, 0, 255); |
| if (complex_block) |
| c[0] = 255; |
| |
| if (very_complex_block) |
| c[1] = 255; |
| |
| if (outer_pass == 2) |
| c[2] = 255; |
| else if (outer_pass == 1) |
| c[2] = 128; |
| |
| debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c); |
| |
| c.set(0, 255, 0, 255); |
| if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7) |
| c.set(255, 0, 0, 255); |
| debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c); |
| |
| switch (best_candidate.m_encoding_type) |
| { |
| case encoding_type::cRun: |
| c.set(0, 0, 0, 255); |
| break; |
| case encoding_type::cSolid: |
| c.set(128, 128, 128, 255); // dark grey |
| break; |
| case encoding_type::cReuse: |
| c.set(255, 255, 0, 255); // yellow |
| break; |
| case encoding_type::cBlock: |
| { |
| switch (best_candidate.m_endpoint_mode) |
| { |
| case endpoint_mode::cRaw: |
| c.set(255, 0, 0, 255); // red |
| break; |
| case endpoint_mode::cUseLeft: |
| c.set(0, 0, 255, 255); // blue |
| break; |
| case endpoint_mode::cUseUpper: |
| c.set(0, 0, 192, 255); // darker blue |
| break; |
| case endpoint_mode::cUseLeftDelta: |
| c.set(0, 255, 0, 255); // green |
| break; |
| case endpoint_mode::cUseUpperDelta: |
| c.set(0, 192, 0, 255); // darker green |
| break; |
| default: |
| break; |
| } |
| |
| break; |
| } |
| default: |
| break; |
| } |
| |
| if (filtered_x_err < filtered_y_err) |
| c[3] = 0; |
| else |
| c[3] = 255; |
| |
| debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c); |
| } |
| |
| break; |
| |
| } // outer_pass |
| |
| } // bx |
| |
| } // by |
| |
| if (prev_encoding.get_total_bits()) |
| { |
| #if SYNC_MARKERS |
| strip_coded_bits.put_bits(0xDEAD, 16); |
| #endif |
| |
| strip_coded_bits.append(prev_encoding); |
| } |
| |
| return true; |
| } |
| |
// Set to true once global_init() has completed; compress_photo() asserts/checks this before encoding.
bool g_initialized = false;
| |
| void global_init() |
| { |
| if (g_initialized) |
| return; |
| |
| interval_timer tm; |
| tm.start(); |
| |
| init_pq_tables(); |
| |
| init_partitions2_6x6(); |
| init_partitions3_6x6(); |
| |
| init_contrib_lists(); |
| |
| g_initialized = true; |
| |
| //fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs()); |
| } |
| |
| bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool, |
| basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics) |
| { |
| assert(g_initialized); |
| if (!g_initialized) |
| return false; |
| |
| assert(pJob_pool); |
| |
| if (orig_global_cfg.m_debug_output) |
| { |
| fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n"); |
| fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height()); |
| fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads()); |
| orig_global_cfg.print(); |
| } |
| |
| if (!orig_src_img.get_width() || !orig_src_img.get_height()) |
| { |
| assert(false); |
| fmt_error_printf("compress_photo: Invalid source image\n"); |
| return false; |
| } |
| |
| astc_hdr_6x6_global_config global_cfg(orig_global_cfg); |
| |
| uastc_hdr_6x6_encode_state enc_state; |
| enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6; |
| enc_state.src_img = orig_src_img; |
| |
| //src_img.crop(256, 256); |
| |
| const uint32_t width = enc_state.src_img.get_width(); |
| const uint32_t height = enc_state.src_img.get_height(); |
| const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W); |
| const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H); |
| const uint32_t total_blocks = num_blocks_x * num_blocks_y; |
| |
| for (uint32_t y = 0; y < height; y++) |
| { |
| for (uint32_t x = 0; x < width; x++) |
| { |
| for (uint32_t c = 0; c < 3; c++) |
| { |
| float f = enc_state.src_img(x, y)[c]; |
| |
| if (std::isinf(f) || std::isnan(f) || (f < 0.0f)) |
| f = 0; |
| else if (f > basist::ASTC_HDR_MAX_VAL) |
| f = basist::ASTC_HDR_MAX_VAL; |
| |
| enc_state.src_img(x, y)[c] = f; |
| |
| } // c |
| |
| } // x |
| } // y |
| |
| if (global_cfg.m_debug_images) |
| { |
| write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0); |
| } |
| |
| image src_img_compressed; |
| tonemap_image_compressive2(src_img_compressed, enc_state.src_img); |
| |
| if (global_cfg.m_debug_images) |
| { |
| save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed); |
| } |
| |
| smooth_map_params rp; |
| rp.m_debug_images = global_cfg.m_debug_images; |
| |
| if (global_cfg.m_lambda != 0.0f) |
| { |
| if (global_cfg.m_status_output) |
| fmt_printf("Creating RDO perceptual weighting maps\n"); |
| |
| create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp); |
| } |
| |
| if (global_cfg.m_status_output) |
| fmt_printf("Blurring image\n"); |
| |
| enc_state.src_img_filtered1.resize(width, height); |
| image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f); |
| |
| enc_state.src_img_filtered2.resize(width, height); |
| image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f); |
| |
| if (global_cfg.m_debug_images) |
| { |
| write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0); |
| write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0); |
| } |
| |
| if (global_cfg.m_status_output) |
| fmt_printf("Transforming to ITP\n"); |
| |
| enc_state.src_img_itp.resize(width, height); |
| convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg); |
| |
| enc_state.src_img_filtered1_itp.resize(width, height); |
| convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg); |
| |
| enc_state.src_img_filtered2_itp.resize(width, height); |
| convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg); |
| |
| if (global_cfg.m_lambda == 0.0f) |
| global_cfg.m_favor_higher_compression = false; |
| |
| uint32_t total_strips = 0, rows_per_strip = 0; |
| if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg)) |
| { |
| fmt_error_printf("compress_photo: Failed computing strip sizes\n"); |
| return false; |
| } |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag); |
| |
| enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y); |
| |
| bitwise_coder coded_bits; |
| |
| // For Basis v1.60 files write the original marker, otherwise write the new marker. |
| coded_bits.put_bits(global_cfg.m_write_basisu_1_6_compatible_files ? UASTC_6x6_HDR_SIG0 : UASTC_6x6_HDR_SIG1, 16); |
| |
| coded_bits.put_bits(width, 16); |
| coded_bits.put_bits(height, 16); |
| |
| enc_state.packed_img.resize(width, height); |
| |
| enc_state.strip_bits.resize(total_strips); |
| |
| enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y); |
| |
| uastc_hdr_6x6_debug_state debug_state; |
| |
| if (global_cfg.m_debug_images) |
| debug_state.init(width, height); |
| else |
| debug_state.init(0, 0); |
| |
| interval_timer tm; |
| tm.start(); |
| |
| std::atomic_bool any_failed_flag; |
| any_failed_flag.store(false); |
| |
| for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++) |
| { |
| const uint32_t strip_first_by = strip_index * rows_per_strip; |
| |
| uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y); |
| if (strip_index == (total_strips - 1)) |
| strip_last_by = num_blocks_y - 1; |
| |
| pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state, |
| strip_index, total_strips, strip_first_by, strip_last_by, |
| num_blocks_x, num_blocks_y, total_blocks, width, height] |
| { |
| if (!any_failed_flag) |
| { |
| bool status = compress_strip_task( |
| strip_index, total_strips, strip_first_by, strip_last_by, |
| num_blocks_x, num_blocks_y, total_blocks, width, height, |
| global_cfg, debug_state, enc_state); |
| |
| if (!status) |
| { |
| fmt_error_printf("compress_photo: compress_strip_task() failed\n"); |
| any_failed_flag.store(true, std::memory_order_relaxed); |
| } |
| } |
| } ); |
| |
| if (any_failed_flag) |
| break; |
| |
| } // strip_index |
| |
| pJob_pool->wait_for_all(); |
| |
| if (any_failed_flag) |
| { |
| fmt_error_printf("One or more strips failed during compression\n"); |
| return false; |
| } |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs()); |
| |
| if (global_cfg.m_debug_output) |
| debug_state.print(total_blocks); |
| |
| if (global_cfg.m_debug_images) |
| { |
| save_png(global_cfg.m_debug_image_prefix + "part_vis.png", debug_state.m_part_vis); |
| save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis); |
| save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis); |
| save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2); |
| save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis); |
| write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0); |
| } |
| |
| for (uint32_t i = 0; i < total_strips; i++) |
| coded_bits.append(enc_state.strip_bits[i]); |
| |
| coded_bits.put_bits(0xA742, 16); |
| |
| coded_bits.flush(); |
| |
| if (global_cfg.m_output_images) |
| { |
| write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0); |
| } |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height)); |
| |
| vector2D<astc_helpers::astc_block> decoded_blocks1; |
| vector2D<astc_helpers::astc_block> decoded_blocks2; |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("decode_file\n"); |
| |
| uint32_t unpacked_width = 0, unpacked_height = 0; |
| bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height); |
| if (!status) |
| { |
| fmt_error_printf("decode_file() failed\n"); |
| return false; |
| } |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("decode_6x6_hdr\n"); |
| |
| status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height); |
| if (!status) |
| { |
| fmt_error_printf("decode_6x6_hdr_file() failed\n"); |
| return false; |
| } |
| |
| if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) || |
| (enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height())) |
| { |
| fmt_error_printf("Decode size mismatch with decode_file\n"); |
| return false; |
| } |
| |
| if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) || |
| (enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height())) |
| { |
| fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n"); |
| return false; |
| } |
| |
| if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0) |
| { |
| fmt_error_printf("Decoded ASTC blocks verification failed\n"); |
| return false; |
| } |
| |
| if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0) |
| { |
| fmt_error_printf("Decoded ASTC blocks verification failed\n"); |
| return false; |
| } |
| |
| if (global_cfg.m_debug_output) |
| basisu::fmt_printf("Decoded ASTC verification checks succeeded\n"); |
| |
| if (global_cfg.m_output_images) |
| { |
| if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height)) |
| { |
| basisu::platform_sleep(20); |
| |
| uint8_vec astc_file_data; |
| if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data)) |
| { |
| if (astc_file_data.size() > 16) |
| { |
| astc_file_data.erase(0, 16); |
| |
| size_t comp_size = 0; |
| void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK); |
| mz_free(pComp_data); |
| |
| if (global_cfg.m_debug_output) |
| { |
| fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n", |
| (uint64_t)astc_file_data.size(), |
| (float)astc_file_data.size() * 8.0f / (float)(width * height), |
| (float)comp_size * 8.0f / (float)(width * height)); |
| } |
| } |
| } |
| } |
| } |
| |
| // Must decode all the blocks (even padded rows/cols) to match what the transcoder does. |
| imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6); |
| imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6); |
| |
| for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++) |
| { |
| for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++) |
| { |
| const auto& phys_blk = decoded_blocks1(x, y); |
| |
| vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H]; |
| status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels); |
| if (!status) |
| { |
| fmt_error_printf("unpack_physical_astc_block() failed\n"); |
| return false; |
| } |
| |
| unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H); |
| |
| vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H]; |
| status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google); |
| if (!status) |
| { |
| fmt_error_printf("unpack_physical_astc_block_google() failed\n"); |
| return false; |
| } |
| |
| unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H); |
| |
| for (uint32_t i = 0; i < 36; i++) |
| { |
| if (pixels[i] != pixels_google[i]) |
| { |
| fmt_error_printf("pixel unpack mismatch\n"); |
| return false; |
| } |
| } |
| } |
| } |
| |
| if (global_cfg.m_debug_output) |
| fmt_printf("\nUnpack succeeded\n"); |
| |
| imagef unpacked_bc6h_img; |
| |
| { |
| vector2D<basist::bc6h_block> bc6h_blocks; |
| |
| fast_bc6h_params enc_params; |
| |
| bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params); |
| if (!pack_status) |
| { |
| fmt_error_printf("pack_bc6h_image() failed!"); |
| return false; |
| } |
| |
| unpacked_bc6h_img.crop(width, height); |
| |
| if (global_cfg.m_output_images) |
| { |
| write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0); |
| } |
| } |
| |
| unpacked_astc_img.crop(width, height); |
| unpacked_astc_google_img.crop(width, height); |
| |
| if (global_cfg.m_output_images) |
| { |
| write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0); |
| write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0); |
| } |
| |
| // ASTC metrics |
| if (global_cfg.m_image_stats) |
| { |
| image_metrics im; |
| |
| if (global_cfg.m_debug_output) |
| printf("\nASTC log2 float error metrics:\n"); |
| |
| for (uint32_t i = 0; i < 3; i++) |
| { |
| im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("%c: ", "RGBA"[i]); |
| im.print_hp(); |
| } |
| } |
| |
| metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("RGB: "); |
| metrics.m_im_astc_log2.print_hp(); |
| |
| printf("\n"); |
| } |
| } |
| |
| if (global_cfg.m_image_stats) |
| { |
| image_metrics im; |
| |
| if (global_cfg.m_debug_output) |
| printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n"); |
| |
| for (uint32_t i = 0; i < 3; i++) |
| { |
| im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("%c: ", "RGBA"[i]); |
| im.print_hp(); |
| } |
| } |
| |
| metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("RGB: "); |
| metrics.m_im_astc_half.print_hp(); |
| } |
| } |
| |
| // BC6H metrics |
| if (global_cfg.m_image_stats) |
| { |
| image_metrics im; |
| |
| if (global_cfg.m_debug_output) |
| printf("\nBC6H log2 float error metrics:\n"); |
| |
| for (uint32_t i = 0; i < 3; i++) |
| { |
| im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("%c: ", "RGBA"[i]); |
| im.print_hp(); |
| } |
| } |
| |
| metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("RGB: "); |
| metrics.m_im_bc6h_log2.print_hp(); |
| |
| printf("\n"); |
| } |
| } |
| |
| if (global_cfg.m_image_stats) |
| { |
| image_metrics im; |
| |
| if (global_cfg.m_debug_output) |
| printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n"); |
| |
| for (uint32_t i = 0; i < 3; i++) |
| { |
| im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("%c: ", "RGBA"[i]); |
| im.print_hp(); |
| } |
| } |
| |
| metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true); |
| |
| if (global_cfg.m_debug_output) |
| { |
| printf("RGB: "); |
| metrics.m_im_bc6h_half.print_hp(); |
| |
| printf("\n"); |
| } |
| } |
| |
| intermediate_tex_data.swap(coded_bits.get_bytes()); |
| |
| astc_tex_data.resize(decoded_blocks1.size_in_bytes()); |
| memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes()); |
| |
| return true; |
| } |
| |
| } // namespace astc_6x6_hdr |