UASTC HDR merge
diff --git a/encoder/3rdparty/android_astc_decomp.cpp b/encoder/3rdparty/android_astc_decomp.cpp
new file mode 100644
index 0000000..5abfe2f
--- /dev/null
+++ b/encoder/3rdparty/android_astc_decomp.cpp
@@ -0,0 +1,2052 @@
+// File: android_astc_decomp.cpp
+
+/*-------------------------------------------------------------------------
+ * drawElements Quality Program Tester Core
+ * ----------------------------------------
+ *
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * rg: Removed external dependencies, minor fix to decompress() so it converts non-sRGB
+ * output to 8-bits correctly. I've compared this decoder's output
+ * vs. astc-codec with random inputs.
+ * 
+ *//*!
+ * \file
+ * \brief ASTC Utilities.
+ *//*--------------------------------------------------------------------*/
+#include "android_astc_decomp.h"
+#include <assert.h>
+#include <algorithm>
+#include <fenv.h>
+#include <math.h>
+
+#define DE_LENGTH_OF_ARRAY(x) (sizeof(x)/sizeof(x[0]))
+#define DE_UNREF(x) (void)x
+
+typedef uint8_t deUint8;
+typedef int8_t deInt8;
+typedef uint32_t deUint32;
+typedef int32_t deInt32;
+typedef uint16_t deUint16;
+typedef int16_t deInt16;
+typedef int64_t deInt64;
+typedef uint64_t deUint64;
+
+#define DE_ASSERT assert
+
+#ifdef _MSC_VER
+#pragma warning (disable:4505) // unreferenced local function has been removed
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+namespace basisu_astc
+{
+    template <typename S> inline S maximum(S a, S b) { return (a > b) ? a : b; }
+    template <typename S> inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
+    template <typename S> inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
+
+    static bool inBounds(int v, int l, int h)
+    {
+        return (v >= l) && (v < h);
+    }
+
+    static bool inRange(int v, int l, int h)
+    {
+        return (v >= l) && (v <= h);
+    }
+
+    template<typename T>
+    static inline T max(T a, T b)
+    {
+        return (a > b) ? a : b;
+    }
+
+    template<typename T>
+    static inline T min(T a, T b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    template<typename T>
+    static inline T clamp(T a, T l, T h)
+    {
+        if (a < l)
+            return l;
+        else if (a > h)
+            return h;
+        return a;
+    }
+
+    struct UVec4
+    {
+        uint32_t m_c[4];
+
+        UVec4()
+        {
+            m_c[0] = 0;
+            m_c[1] = 0;
+            m_c[2] = 0;
+            m_c[3] = 0;
+        }
+
+        UVec4(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
+        {
+            m_c[0] = x;
+            m_c[1] = y;
+            m_c[2] = z;
+            m_c[3] = w;
+        }
+
+        uint32_t x() const { return m_c[0]; }
+        uint32_t y() const { return m_c[1]; }
+        uint32_t z() const { return m_c[2]; }
+        uint32_t w() const { return m_c[3]; }
+
+        uint32_t& x() { return m_c[0]; }
+        uint32_t& y() { return m_c[1]; }
+        uint32_t& z() { return m_c[2]; }
+        uint32_t& w() { return m_c[3]; }
+
+        uint32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
+        uint32_t& operator[] (uint32_t idx) { assert(idx < 4);  return m_c[idx]; }
+    };
+
+    struct IVec4
+    {
+        int32_t m_c[4];
+
+        IVec4()
+        {
+            m_c[0] = 0;
+            m_c[1] = 0;
+            m_c[2] = 0;
+            m_c[3] = 0;
+        }
+
+        IVec4(int32_t x, int32_t y, int32_t z, int32_t w)
+        {
+            m_c[0] = x;
+            m_c[1] = y;
+            m_c[2] = z;
+            m_c[3] = w;
+        }
+
+        int32_t x() const { return m_c[0]; }
+        int32_t y() const { return m_c[1]; }
+        int32_t z() const { return m_c[2]; }
+        int32_t w() const { return m_c[3]; }
+
+        int32_t& x() { return m_c[0]; }
+        int32_t& y() { return m_c[1]; }
+        int32_t& z() { return m_c[2]; }
+        int32_t& w() { return m_c[3]; }
+
+        UVec4 asUint() const
+        {
+            return UVec4(maximum(0, m_c[0]), maximum(0, m_c[1]), maximum(0, m_c[2]), maximum(0, m_c[3]));
+        }
+
+        int32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
+        int32_t& operator[] (uint32_t idx) { assert(idx < 4);  return m_c[idx]; }
+    };
+
+    struct IVec3
+    {
+        int32_t m_c[3];
+
+        IVec3()
+        {
+            m_c[0] = 0;
+            m_c[1] = 0;
+            m_c[2] = 0;
+        }
+
+        IVec3(int32_t x, int32_t y, int32_t z)
+        {
+            m_c[0] = x;
+            m_c[1] = y;
+            m_c[2] = z;
+        }
+
+        int32_t x() const { return m_c[0]; }
+        int32_t y() const { return m_c[1]; }
+        int32_t z() const { return m_c[2]; }
+
+        int32_t& x() { return m_c[0]; }
+        int32_t& y() { return m_c[1]; }
+        int32_t& z() { return m_c[2]; }
+
+        int32_t operator[] (uint32_t idx) const { assert(idx < 3);  return m_c[idx]; }
+        int32_t& operator[] (uint32_t idx) { assert(idx < 3);  return m_c[idx]; }
+    };
+
+    static uint32_t deDivRoundUp32(uint32_t a, uint32_t b)
+    {
+        return (a + b - 1) / b;
+    }
+
+    static bool deInBounds32(uint32_t v, uint32_t l, uint32_t h)
+    {
+        return (v >= l) && (v < h);
+    }
+
+namespace astc 
+{
+
+using std::vector;
+
+namespace
+{
+
+// Common utilities
+enum
+{
+    MAX_BLOCK_WIDTH     = 12,
+    MAX_BLOCK_HEIGHT    = 12
+};
+
+inline deUint32 getBit (deUint32 src, int ndx)
+{
+    DE_ASSERT(basisu_astc::inBounds(ndx, 0, 32));
+    return (src >> ndx) & 1;
+}
+
+inline deUint32 getBits (deUint32 src, int low, int high)
+{
+    const int numBits = (high-low) + 1;
+    DE_ASSERT(basisu_astc::inRange(numBits, 1, 32));
+
+    if (numBits < 32)
+        return (deUint32)((src >> low) & ((1u<<numBits)-1));
+    else
+        return (deUint32)((src >> low) & 0xFFFFFFFFu);
+}
+
+inline bool isBitSet (deUint32 src, int ndx)
+{
+    return getBit(src, ndx) != 0;
+}
+
+inline deUint32 reverseBits (deUint32 src, int numBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numBits, 0, 32));
+    
+    deUint32 result = 0;
+    for (int i = 0; i < numBits; i++)
+        result |= ((src >> i) & 1) << (numBits-1-i);
+
+    return result;
+}
+
+inline deUint32 bitReplicationScale (deUint32 src, int numSrcBits, int numDstBits)
+{
+    DE_ASSERT(numSrcBits <= numDstBits);
+    DE_ASSERT((src & ((1<<numSrcBits)-1)) == src);
+
+    deUint32 dst = 0;
+    for (int shift = numDstBits-numSrcBits; shift > -numSrcBits; shift -= numSrcBits)
+        dst |= (shift >= 0) ? (src << shift) : (src >> -shift);
+
+    return dst;
+}
+
+inline deInt32 signExtend (deInt32 src, int numSrcBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numSrcBits, 2, 31));
+
+    const bool negative = (src & (1 << (numSrcBits-1))) != 0;
+    return src | (negative ? ~((1 << numSrcBits) - 1) : 0);
+}
+
+typedef uint16_t deFloat16;
+
+inline bool isFloat16InfOrNan (deFloat16 v)
+{
+    return getBits(v, 10, 14) == 31;
+}
+
+float deFloat16To32(deFloat16 val16)
+{
+    deUint32 sign;
+    deUint32 expotent;
+    deUint32 mantissa;
+
+    union
+    {
+        float       f;
+        deUint32    u;
+    } x;
+
+    x.u = 0u;
+
+    sign = ((deUint32)val16 >> 15u) & 0x00000001u;
+    expotent = ((deUint32)val16 >> 10u) & 0x0000001fu;
+    mantissa = (deUint32)val16 & 0x000003ffu;
+
+    if (expotent == 0u)
+    {
+        if (mantissa == 0u)
+        {
+            /* +/- 0 */
+            x.u = sign << 31u;
+            return x.f;
+        }
+        else
+        {
+            /* Denormalized, normalize it. */
+
+            while (!(mantissa & 0x00000400u))
+            {
+                mantissa <<= 1u;
+                expotent -= 1u;
+            }
+
+            expotent += 1u;
+            mantissa &= ~0x00000400u;
+        }
+    }
+    else if (expotent == 31u)
+    {
+        if (mantissa == 0u)
+        {
+            /* +/- InF */
+            x.u = (sign << 31u) | 0x7f800000u;
+            return x.f;
+        }
+        else
+        {
+            /* +/- NaN */
+            x.u = (sign << 31u) | 0x7f800000u | (mantissa << 13u);
+            return x.f;
+        }
+    }
+
+    expotent = expotent + (127u - 15u);
+    mantissa = mantissa << 13u;
+
+    x.u = (sign << 31u) | (expotent << 23u) | mantissa;
+    return x.f;
+}
+
+enum ISEMode
+{
+    ISEMODE_TRIT = 0,
+    ISEMODE_QUINT,
+    ISEMODE_PLAIN_BIT,
+    ISEMODE_LAST
+};
+
+struct ISEParams
+{
+    ISEMode     mode;
+    int         numBits;
+    ISEParams (ISEMode mode_, int numBits_) : mode(mode_), numBits(numBits_) {}
+};
+
+inline int computeNumRequiredBits (const ISEParams& iseParams, int numValues)
+{
+    switch (iseParams.mode)
+    {
+        case ISEMODE_TRIT:          return deDivRoundUp32(numValues*8, 5) + numValues*iseParams.numBits;
+        case ISEMODE_QUINT:         return deDivRoundUp32(numValues*7, 3) + numValues*iseParams.numBits;
+        case ISEMODE_PLAIN_BIT:     return numValues*iseParams.numBits;
+        default:
+            DE_ASSERT(false);
+            return -1;
+    }
+}
+
+ISEParams computeMaximumRangeISEParams (int numAvailableBits, int numValuesInSequence)
+{
+    int curBitsForTritMode      = 6;
+    int curBitsForQuintMode     = 5;
+    int curBitsForPlainBitMode  = 8;
+
+    while (true)
+    {
+        DE_ASSERT(curBitsForTritMode > 0 || curBitsForQuintMode > 0 || curBitsForPlainBitMode > 0);
+        const int tritRange         = (curBitsForTritMode > 0)        ? (3 << curBitsForTritMode) - 1         : -1;
+        const int quintRange        = (curBitsForQuintMode > 0)       ? (5 << curBitsForQuintMode) - 1        : -1;
+        const int plainBitRange     = (curBitsForPlainBitMode > 0)    ? (1 << curBitsForPlainBitMode) - 1     : -1;
+        const int maxRange          = basisu_astc::max(basisu_astc::max(tritRange, quintRange), plainBitRange);
+
+        if (maxRange == tritRange)
+        {
+            const ISEParams params(ISEMODE_TRIT, curBitsForTritMode);
+
+            if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+                return ISEParams(ISEMODE_TRIT, curBitsForTritMode);
+
+            curBitsForTritMode--;
+        }
+        else if (maxRange == quintRange)
+        {
+            const ISEParams params(ISEMODE_QUINT, curBitsForQuintMode);
+
+            if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+                return ISEParams(ISEMODE_QUINT, curBitsForQuintMode);
+
+            curBitsForQuintMode--;
+        }
+        else
+        {
+            const ISEParams params(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode);
+            DE_ASSERT(maxRange == plainBitRange);
+
+            if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+                return ISEParams(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode);
+
+            curBitsForPlainBitMode--;
+        }
+    }
+}
+
+inline int computeNumColorEndpointValues (deUint32 endpointMode)
+{
+    DE_ASSERT(endpointMode < 16);
+    return (endpointMode/4 + 1) * 2;
+}
+
+// Decompression utilities
+enum DecompressResult
+{
+    DECOMPRESS_RESULT_VALID_BLOCK   = 0,    //!< Decompressed valid block
+    DECOMPRESS_RESULT_ERROR,                //!< Encountered error while decompressing, error color written
+    DECOMPRESS_RESULT_LAST
+};
+
+// A helper for getting bits from a 128-bit block.
+class Block128
+{
+private:
+    typedef deUint64 Word;
+
+    enum
+    {
+        WORD_BYTES  = sizeof(Word),
+        WORD_BITS   = 8*WORD_BYTES,
+        NUM_WORDS   = 128 / WORD_BITS
+    };
+    //DE_STATIC_ASSERT(128 % WORD_BITS == 0);
+
+public:
+    Block128 (const deUint8* src)
+    {
+        for (int wordNdx = 0; wordNdx < NUM_WORDS; wordNdx++)
+        {
+            m_words[wordNdx] = 0;
+            for (int byteNdx = 0; byteNdx < WORD_BYTES; byteNdx++)
+                m_words[wordNdx] |= (Word)src[wordNdx*WORD_BYTES + byteNdx] << (8*byteNdx);
+        }
+    }
+
+    deUint32 getBit (int ndx) const
+    {
+        DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128));
+        return (m_words[ndx / WORD_BITS] >> (ndx % WORD_BITS)) & 1;
+    }
+
+    deUint32 getBits (int low, int high) const
+    {
+        DE_ASSERT(basisu_astc::inBounds(low, 0, 128));
+        DE_ASSERT(basisu_astc::inBounds(high, 0, 128));
+        DE_ASSERT(basisu_astc::inRange(high-low+1, 0, 32));
+
+        if (high-low+1 == 0)
+            return 0;
+
+        const int word0Ndx = low / WORD_BITS;
+        const int word1Ndx = high / WORD_BITS;
+        // \note "foo << bar << 1" done instead of "foo << (bar+1)" to avoid overflow, i.e. shift amount being too big.
+        if (word0Ndx == word1Ndx)
+            return (deUint32)((m_words[word0Ndx] & ((((Word)1 << high%WORD_BITS << 1) - 1))) >> ((Word)low % WORD_BITS));
+        else
+        {
+            DE_ASSERT(word1Ndx == word0Ndx + 1);
+            return (deUint32)(m_words[word0Ndx] >> (low%WORD_BITS)) |
+                   (deUint32)((m_words[word1Ndx] & (((Word)1 << high%WORD_BITS << 1) - 1)) << (high-low - high%WORD_BITS));
+        }
+    }
+
+    bool isBitSet (int ndx) const
+    {
+        DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128));
+        return getBit(ndx) != 0;
+    }
+
+private:
+    Word m_words[NUM_WORDS];
+};
+
+// A helper for sequential access into a Block128.
+class BitAccessStream
+{
+public:
+    BitAccessStream (const Block128& src, int startNdxInSrc, int length, bool forward)
+        : m_src             (src)
+        , m_startNdxInSrc   (startNdxInSrc)
+        , m_length          (length)
+        , m_forward         (forward)
+        , m_ndx             (0)
+    {
+    }
+
+    // Get the next num bits. Bits at positions greater than or equal to m_length are zeros.
+    deUint32 getNext (int num)
+    {
+        if (num == 0 || m_ndx >= m_length)
+            return 0;
+        const int end               = m_ndx + num;
+        const int numBitsFromSrc    = basisu_astc::max(0, basisu_astc::min(m_length, end) - m_ndx);
+        const int low               = m_ndx;
+        const int high              = m_ndx + numBitsFromSrc - 1;
+
+        m_ndx += num;
+        
+        return m_forward ?             m_src.getBits(m_startNdxInSrc + low,  m_startNdxInSrc + high)
+                         : reverseBits(m_src.getBits(m_startNdxInSrc - high, m_startNdxInSrc - low), numBitsFromSrc);
+    }
+
+private:
+    const Block128&     m_src;
+    const int           m_startNdxInSrc;
+    const int           m_length;
+    const bool          m_forward;
+    int                 m_ndx;
+};
+
+struct ISEDecodedResult
+{
+    deUint32 m;
+    deUint32 tq; //!< Trit or quint value, depending on ISE mode.
+    deUint32 v;
+};
+
+// Data from an ASTC block's "block mode" part (i.e. bits [0,10]).
+struct ASTCBlockMode
+{
+    bool        isError;
+    // \note Following fields only relevant if !isError.
+    bool        isVoidExtent;
+    // \note Following fields only relevant if !isVoidExtent.
+    bool        isDualPlane;
+    int         weightGridWidth;
+    int         weightGridHeight;
+    ISEParams   weightISEParams;
+
+    ASTCBlockMode (void)
+        : isError           (true)
+        , isVoidExtent      (true)
+        , isDualPlane       (true)
+        , weightGridWidth   (-1)
+        , weightGridHeight  (-1)
+        , weightISEParams   (ISEMODE_LAST, -1)
+    {
+    }
+};
+
+inline int computeNumWeights (const ASTCBlockMode& mode)
+{
+    return mode.weightGridWidth * mode.weightGridHeight * (mode.isDualPlane ? 2 : 1);
+}
+
+struct ColorEndpointPair
+{
+    UVec4 e0;
+    UVec4 e1;
+};
+
+struct TexelWeightPair
+{
+    deUint32 w[2];
+};
+
+ASTCBlockMode getASTCBlockMode (deUint32 blockModeData)
+{
+    ASTCBlockMode blockMode;
+    blockMode.isError = true; // \note Set to false later, if not error.
+    blockMode.isVoidExtent = getBits(blockModeData, 0, 8) == 0x1fc;
+    if (!blockMode.isVoidExtent)
+    {
+        if ((getBits(blockModeData, 0, 1) == 0 && getBits(blockModeData, 6, 8) == 7) || getBits(blockModeData, 0, 3) == 0)
+            return blockMode; // Invalid ("reserved").
+
+        deUint32 r = (deUint32)-1; // \note Set in the following branches.
+
+        if (getBits(blockModeData, 0, 1) == 0)
+        {
+            const deUint32 r0   = getBit(blockModeData, 4);
+            const deUint32 r1   = getBit(blockModeData, 2);
+            const deUint32 r2   = getBit(blockModeData, 3);
+            const deUint32 i78  = getBits(blockModeData, 7, 8);
+            
+            r = (r2 << 2) | (r1 << 1) | (r0 << 0);
+
+            if (i78 == 3)
+            {
+                const bool i5 = isBitSet(blockModeData, 5);
+                blockMode.weightGridWidth   = i5 ? 10 : 6;
+                blockMode.weightGridHeight  = i5 ? 6  : 10;
+            }
+            else
+            {
+                const deUint32 a = getBits(blockModeData, 5, 6);
+
+                switch (i78)
+                {
+                    case 0:     blockMode.weightGridWidth = 12;     blockMode.weightGridHeight = a + 2;                                 break;
+                    case 1:     blockMode.weightGridWidth = a + 2;  blockMode.weightGridHeight = 12;                                    break;
+                    case 2:     blockMode.weightGridWidth = a + 6;  blockMode.weightGridHeight = getBits(blockModeData, 9, 10) + 6;     break;
+                    default: DE_ASSERT(false);
+                }
+            }
+        }
+        else
+        {
+            const deUint32 r0   = getBit(blockModeData, 4);
+            const deUint32 r1   = getBit(blockModeData, 0);
+            const deUint32 r2   = getBit(blockModeData, 1);
+            const deUint32 i23  = getBits(blockModeData, 2, 3);
+            const deUint32 a    = getBits(blockModeData, 5, 6);
+
+            r = (r2 << 2) | (r1 << 1) | (r0 << 0);
+            if (i23 == 3)
+            {
+                const deUint32  b   = getBit(blockModeData, 7);
+                const bool      i8  = isBitSet(blockModeData, 8);
+                blockMode.weightGridWidth   = i8 ? b+2 : a+2;
+                blockMode.weightGridHeight  = i8 ? a+2 : b+6;
+            }
+            else
+            {
+                const deUint32 b = getBits(blockModeData, 7, 8);
+                switch (i23)
+                {
+                    case 0:     blockMode.weightGridWidth = b + 4;  blockMode.weightGridHeight = a + 2; break;
+                    case 1:     blockMode.weightGridWidth = b + 8;  blockMode.weightGridHeight = a + 2; break;
+                    case 2:     blockMode.weightGridWidth = a + 2;  blockMode.weightGridHeight = b + 8; break;
+                    default: DE_ASSERT(false);
+                }
+            }
+        }
+
+        const bool  zeroDH      = getBits(blockModeData, 0, 1) == 0 && getBits(blockModeData, 7, 8) == 2;
+        const bool  h           = zeroDH ? 0 : isBitSet(blockModeData, 9);
+        blockMode.isDualPlane   = zeroDH ? 0 : isBitSet(blockModeData, 10);
+
+        {
+            ISEMode&    m   = blockMode.weightISEParams.mode;
+            int&        b   = blockMode.weightISEParams.numBits;
+            m = ISEMODE_PLAIN_BIT;
+            b = 0;
+            if (h)
+            {
+                switch (r)
+                {
+                    case 2:                         m = ISEMODE_QUINT;  b = 1;  break;
+                    case 3:     m = ISEMODE_TRIT;                       b = 2;  break;
+                    case 4:                                             b = 4;  break;
+                    case 5:                         m = ISEMODE_QUINT;  b = 2;  break;
+                    case 6:     m = ISEMODE_TRIT;                       b = 3;  break;
+                    case 7:                                             b = 5;  break;
+                    default:    DE_ASSERT(false);
+                }
+            }
+            else
+            {
+                switch (r)
+                {
+                    case 2:                                             b = 1;  break;
+                    case 3:     m = ISEMODE_TRIT;                               break;
+                    case 4:                                             b = 2;  break;
+                    case 5:                         m = ISEMODE_QUINT;          break;
+                    case 6:     m = ISEMODE_TRIT;                       b = 1;  break;
+                    case 7:                                             b = 3;  break;
+                    default:    DE_ASSERT(false);
+                }
+            }
+        }
+    }
+
+    blockMode.isError = false;
+    return blockMode;
+}
+
+inline void setASTCErrorColorBlock (void* dst, int blockWidth, int blockHeight, bool isSRGB)
+{
+    if (isSRGB)
+    {
+        deUint8* const dstU = (deUint8*)dst;
+        for (int i = 0; i < blockWidth*blockHeight; i++)
+        {
+            dstU[4*i + 0] = 0xff;
+            dstU[4*i + 1] = 0;
+            dstU[4*i + 2] = 0xff;
+            dstU[4*i + 3] = 0xff;
+        }
+    }
+    else
+    {
+        float* const dstF = (float*)dst;
+        for (int i = 0; i < blockWidth*blockHeight; i++)
+        {
+            dstF[4*i + 0] = 1.0f;
+            dstF[4*i + 1] = 0.0f;
+            dstF[4*i + 2] = 1.0f;
+            dstF[4*i + 3] = 1.0f;
+        }
+    }
+}
+
+DecompressResult decodeVoidExtentBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode)
+{
+    const deUint32  minSExtent          = blockData.getBits(12, 24);
+    const deUint32  maxSExtent          = blockData.getBits(25, 37);
+    const deUint32  minTExtent          = blockData.getBits(38, 50);
+    const deUint32  maxTExtent          = blockData.getBits(51, 63);
+    const bool      allExtentsAllOnes   = (minSExtent == 0x1fff) && (maxSExtent == 0x1fff) && (minTExtent == 0x1fff) && (maxTExtent == 0x1fff);
+    const bool      isHDRBlock          = blockData.isBitSet(9);
+    
+    if ((isLDRMode && isHDRBlock) || (!allExtentsAllOnes && (minSExtent >= maxSExtent || minTExtent >= maxTExtent)))
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    const deUint32 rgba[4] =
+    {
+        blockData.getBits(64,  79),
+        blockData.getBits(80,  95),
+        blockData.getBits(96,  111),
+        blockData.getBits(112, 127)
+    };
+
+    if (isSRGB)
+    {
+        deUint8* const dstU = (deUint8*)dst;
+        for (int i = 0; i < blockWidth * blockHeight; i++)
+        {
+            for (int c = 0; c < 4; c++)
+                dstU[i * 4 + c] = (deUint8)((rgba[c] & 0xff00) >> 8);
+        }
+    }
+    else
+    {
+        float* const dstF = (float*)dst;
+
+        if (isHDRBlock)
+        {
+            for (int c = 0; c < 4; c++)
+            {
+                if (isFloat16InfOrNan((deFloat16)rgba[c]))
+                {
+                    //throw InternalError("Infinity or NaN color component in HDR void extent block in ASTC texture (behavior undefined by ASTC specification)");
+                    setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+                    return DECOMPRESS_RESULT_ERROR;
+                }
+            }
+
+            for (int i = 0; i < blockWidth * blockHeight; i++)
+            {
+                for (int c = 0; c < 4; c++)
+                    dstF[i * 4 + c] = deFloat16To32((deFloat16)rgba[c]);
+            }
+        }
+        else
+        {
+            for (int i = 0; i < blockWidth * blockHeight; i++)
+            {
+                for (int c = 0; c < 4; c++)
+                    dstF[i * 4 + c] = (rgba[c] == 65535) ? 1.0f : ((float)rgba[c] / 65536.0f);
+            }
+        }
+    }
+
+    return DECOMPRESS_RESULT_VALID_BLOCK;
+}
+
+void decodeColorEndpointModes (deUint32* endpointModesDst, const Block128& blockData, int numPartitions, int extraCemBitsStart)
+{
+    if (numPartitions == 1)
+        endpointModesDst[0] = blockData.getBits(13, 16);
+    else
+    {
+        const deUint32 highLevelSelector = blockData.getBits(23, 24);
+
+        if (highLevelSelector == 0)
+        {
+            const deUint32 mode = blockData.getBits(25, 28);
+
+            for (int i = 0; i < numPartitions; i++)
+                endpointModesDst[i] = mode;
+        }
+        else
+        {
+            for (int partNdx = 0; partNdx < numPartitions; partNdx++)
+            {
+                const deUint32 cemClass     = highLevelSelector - (blockData.isBitSet(25 + partNdx) ? 0 : 1);
+                const deUint32 lowBit0Ndx   = numPartitions + 2*partNdx;
+                const deUint32 lowBit1Ndx   = numPartitions + 2*partNdx + 1;
+                const deUint32 lowBit0      = blockData.getBit(lowBit0Ndx < 4 ? 25+lowBit0Ndx : extraCemBitsStart+lowBit0Ndx-4);
+                const deUint32 lowBit1      = blockData.getBit(lowBit1Ndx < 4 ? 25+lowBit1Ndx : extraCemBitsStart+lowBit1Ndx-4);
+
+                endpointModesDst[partNdx] = (cemClass << 2) | (lowBit1 << 1) | lowBit0;
+            }
+        }
+    }
+}
+
+int computeNumColorEndpointValues (const deUint32* endpointModes, int numPartitions)
+{
+    int result = 0;
+
+    for (int i = 0; i < numPartitions; i++)
+        result += computeNumColorEndpointValues(endpointModes[i]);
+
+    return result;
+}
+
+void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numValues, 1, 5));
+
+    deUint32 m[5];
+    m[0]            = data.getNext(numBits);
+    deUint32 T01    = data.getNext(2);
+    m[1]            = data.getNext(numBits);
+    deUint32 T23    = data.getNext(2);
+    m[2]            = data.getNext(numBits);
+    deUint32 T4     = data.getNext(1);
+    m[3]            = data.getNext(numBits);
+    deUint32 T56    = data.getNext(2);
+    m[4]            = data.getNext(numBits);
+    deUint32 T7     = data.getNext(1);
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough="            
+#endif  
+    switch (numValues)
+    {
+        // \note Fall-throughs.
+        case 1: T23     = 0;
+        case 2: T4      = 0;
+        case 3: T56     = 0;
+        case 4: T7      = 0;
+        case 5: break;
+        default:
+            DE_ASSERT(false);
+    }
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif 
+
+    const deUint32 T = (T7 << 7) | (T56 << 5) | (T4 << 4) | (T23 << 2) | (T01 << 0);
+
+    static const deUint32 tritsFromT[256][5] =
+    {
+        { 0,0,0,0,0 }, { 1,0,0,0,0 }, { 2,0,0,0,0 }, { 0,0,2,0,0 }, { 0,1,0,0,0 }, { 1,1,0,0,0 }, { 2,1,0,0,0 }, { 1,0,2,0,0 }, { 0,2,0,0,0 }, { 1,2,0,0,0 }, { 2,2,0,0,0 }, { 2,0,2,0,0 }, { 0,2,2,0,0 }, { 1,2,2,0,0 }, { 2,2,2,0,0 }, { 2,0,2,0,0 },
+        { 0,0,1,0,0 }, { 1,0,1,0,0 }, { 2,0,1,0,0 }, { 0,1,2,0,0 }, { 0,1,1,0,0 }, { 1,1,1,0,0 }, { 2,1,1,0,0 }, { 1,1,2,0,0 }, { 0,2,1,0,0 }, { 1,2,1,0,0 }, { 2,2,1,0,0 }, { 2,1,2,0,0 }, { 0,0,0,2,2 }, { 1,0,0,2,2 }, { 2,0,0,2,2 }, { 0,0,2,2,2 },
+        { 0,0,0,1,0 }, { 1,0,0,1,0 }, { 2,0,0,1,0 }, { 0,0,2,1,0 }, { 0,1,0,1,0 }, { 1,1,0,1,0 }, { 2,1,0,1,0 }, { 1,0,2,1,0 }, { 0,2,0,1,0 }, { 1,2,0,1,0 }, { 2,2,0,1,0 }, { 2,0,2,1,0 }, { 0,2,2,1,0 }, { 1,2,2,1,0 }, { 2,2,2,1,0 }, { 2,0,2,1,0 },
+        { 0,0,1,1,0 }, { 1,0,1,1,0 }, { 2,0,1,1,0 }, { 0,1,2,1,0 }, { 0,1,1,1,0 }, { 1,1,1,1,0 }, { 2,1,1,1,0 }, { 1,1,2,1,0 }, { 0,2,1,1,0 }, { 1,2,1,1,0 }, { 2,2,1,1,0 }, { 2,1,2,1,0 }, { 0,1,0,2,2 }, { 1,1,0,2,2 }, { 2,1,0,2,2 }, { 1,0,2,2,2 },
+        { 0,0,0,2,0 }, { 1,0,0,2,0 }, { 2,0,0,2,0 }, { 0,0,2,2,0 }, { 0,1,0,2,0 }, { 1,1,0,2,0 }, { 2,1,0,2,0 }, { 1,0,2,2,0 }, { 0,2,0,2,0 }, { 1,2,0,2,0 }, { 2,2,0,2,0 }, { 2,0,2,2,0 }, { 0,2,2,2,0 }, { 1,2,2,2,0 }, { 2,2,2,2,0 }, { 2,0,2,2,0 },
+        { 0,0,1,2,0 }, { 1,0,1,2,0 }, { 2,0,1,2,0 }, { 0,1,2,2,0 }, { 0,1,1,2,0 }, { 1,1,1,2,0 }, { 2,1,1,2,0 }, { 1,1,2,2,0 }, { 0,2,1,2,0 }, { 1,2,1,2,0 }, { 2,2,1,2,0 }, { 2,1,2,2,0 }, { 0,2,0,2,2 }, { 1,2,0,2,2 }, { 2,2,0,2,2 }, { 2,0,2,2,2 },
+        { 0,0,0,0,2 }, { 1,0,0,0,2 }, { 2,0,0,0,2 }, { 0,0,2,0,2 }, { 0,1,0,0,2 }, { 1,1,0,0,2 }, { 2,1,0,0,2 }, { 1,0,2,0,2 }, { 0,2,0,0,2 }, { 1,2,0,0,2 }, { 2,2,0,0,2 }, { 2,0,2,0,2 }, { 0,2,2,0,2 }, { 1,2,2,0,2 }, { 2,2,2,0,2 }, { 2,0,2,0,2 },
+        { 0,0,1,0,2 }, { 1,0,1,0,2 }, { 2,0,1,0,2 }, { 0,1,2,0,2 }, { 0,1,1,0,2 }, { 1,1,1,0,2 }, { 2,1,1,0,2 }, { 1,1,2,0,2 }, { 0,2,1,0,2 }, { 1,2,1,0,2 }, { 2,2,1,0,2 }, { 2,1,2,0,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,0,2,2,2 },
+        { 0,0,0,0,1 }, { 1,0,0,0,1 }, { 2,0,0,0,1 }, { 0,0,2,0,1 }, { 0,1,0,0,1 }, { 1,1,0,0,1 }, { 2,1,0,0,1 }, { 1,0,2,0,1 }, { 0,2,0,0,1 }, { 1,2,0,0,1 }, { 2,2,0,0,1 }, { 2,0,2,0,1 }, { 0,2,2,0,1 }, { 1,2,2,0,1 }, { 2,2,2,0,1 }, { 2,0,2,0,1 },
+        { 0,0,1,0,1 }, { 1,0,1,0,1 }, { 2,0,1,0,1 }, { 0,1,2,0,1 }, { 0,1,1,0,1 }, { 1,1,1,0,1 }, { 2,1,1,0,1 }, { 1,1,2,0,1 }, { 0,2,1,0,1 }, { 1,2,1,0,1 }, { 2,2,1,0,1 }, { 2,1,2,0,1 }, { 0,0,1,2,2 }, { 1,0,1,2,2 }, { 2,0,1,2,2 }, { 0,1,2,2,2 },
+        { 0,0,0,1,1 }, { 1,0,0,1,1 }, { 2,0,0,1,1 }, { 0,0,2,1,1 }, { 0,1,0,1,1 }, { 1,1,0,1,1 }, { 2,1,0,1,1 }, { 1,0,2,1,1 }, { 0,2,0,1,1 }, { 1,2,0,1,1 }, { 2,2,0,1,1 }, { 2,0,2,1,1 }, { 0,2,2,1,1 }, { 1,2,2,1,1 }, { 2,2,2,1,1 }, { 2,0,2,1,1 },
+        { 0,0,1,1,1 }, { 1,0,1,1,1 }, { 2,0,1,1,1 }, { 0,1,2,1,1 }, { 0,1,1,1,1 }, { 1,1,1,1,1 }, { 2,1,1,1,1 }, { 1,1,2,1,1 }, { 0,2,1,1,1 }, { 1,2,1,1,1 }, { 2,2,1,1,1 }, { 2,1,2,1,1 }, { 0,1,1,2,2 }, { 1,1,1,2,2 }, { 2,1,1,2,2 }, { 1,1,2,2,2 },
+        { 0,0,0,2,1 }, { 1,0,0,2,1 }, { 2,0,0,2,1 }, { 0,0,2,2,1 }, { 0,1,0,2,1 }, { 1,1,0,2,1 }, { 2,1,0,2,1 }, { 1,0,2,2,1 }, { 0,2,0,2,1 }, { 1,2,0,2,1 }, { 2,2,0,2,1 }, { 2,0,2,2,1 }, { 0,2,2,2,1 }, { 1,2,2,2,1 }, { 2,2,2,2,1 }, { 2,0,2,2,1 },
+        { 0,0,1,2,1 }, { 1,0,1,2,1 }, { 2,0,1,2,1 }, { 0,1,2,2,1 }, { 0,1,1,2,1 }, { 1,1,1,2,1 }, { 2,1,1,2,1 }, { 1,1,2,2,1 }, { 0,2,1,2,1 }, { 1,2,1,2,1 }, { 2,2,1,2,1 }, { 2,1,2,2,1 }, { 0,2,1,2,2 }, { 1,2,1,2,2 }, { 2,2,1,2,2 }, { 2,1,2,2,2 },
+        { 0,0,0,1,2 }, { 1,0,0,1,2 }, { 2,0,0,1,2 }, { 0,0,2,1,2 }, { 0,1,0,1,2 }, { 1,1,0,1,2 }, { 2,1,0,1,2 }, { 1,0,2,1,2 }, { 0,2,0,1,2 }, { 1,2,0,1,2 }, { 2,2,0,1,2 }, { 2,0,2,1,2 }, { 0,2,2,1,2 }, { 1,2,2,1,2 }, { 2,2,2,1,2 }, { 2,0,2,1,2 },
+        { 0,0,1,1,2 }, { 1,0,1,1,2 }, { 2,0,1,1,2 }, { 0,1,2,1,2 }, { 0,1,1,1,2 }, { 1,1,1,1,2 }, { 2,1,1,1,2 }, { 1,1,2,1,2 }, { 0,2,1,1,2 }, { 1,2,1,1,2 }, { 2,2,1,1,2 }, { 2,1,2,1,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,1,2,2,2 }
+    };
+
+    const deUint32 (& trits)[5] = tritsFromT[T];
+    for (int i = 0; i < numValues; i++)
+    {
+        dst[i].m    = m[i];
+        dst[i].tq   = trits[i];
+        dst[i].v    = (trits[i] << numBits) + m[i];
+    }
+}
+
+void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numValues, 1, 3));
+
+    deUint32 m[3];
+    m[0]            = data.getNext(numBits);
+    deUint32 Q012   = data.getNext(3);
+    m[1]            = data.getNext(numBits);
+    deUint32 Q34    = data.getNext(2);
+    m[2]            = data.getNext(numBits);
+    deUint32 Q56    = data.getNext(2);
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough="            
+#endif  
+    switch (numValues)
+    {
+        // \note Fall-throughs.
+        case 1: Q34     = 0;
+        case 2: Q56     = 0;
+        case 3: break;
+        default:
+            DE_ASSERT(false);
+    }
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif 
+
+    const deUint32 Q = (Q56 << 5) | (Q34 << 3) | (Q012 << 0);
+
+    static const deUint32 quintsFromQ[256][3] =
+    {
+        { 0,0,0 }, { 1,0,0 }, { 2,0,0 }, { 3,0,0 }, { 4,0,0 }, { 0,4,0 }, { 4,4,0 }, { 4,4,4 }, { 0,1,0 }, { 1,1,0 }, { 2,1,0 }, { 3,1,0 }, { 4,1,0 }, { 1,4,0 }, { 4,4,1 }, { 4,4,4 },
+        { 0,2,0 }, { 1,2,0 }, { 2,2,0 }, { 3,2,0 }, { 4,2,0 }, { 2,4,0 }, { 4,4,2 }, { 4,4,4 }, { 0,3,0 }, { 1,3,0 }, { 2,3,0 }, { 3,3,0 }, { 4,3,0 }, { 3,4,0 }, { 4,4,3 }, { 4,4,4 },
+        { 0,0,1 }, { 1,0,1 }, { 2,0,1 }, { 3,0,1 }, { 4,0,1 }, { 0,4,1 }, { 4,0,4 }, { 0,4,4 }, { 0,1,1 }, { 1,1,1 }, { 2,1,1 }, { 3,1,1 }, { 4,1,1 }, { 1,4,1 }, { 4,1,4 }, { 1,4,4 },
+        { 0,2,1 }, { 1,2,1 }, { 2,2,1 }, { 3,2,1 }, { 4,2,1 }, { 2,4,1 }, { 4,2,4 }, { 2,4,4 }, { 0,3,1 }, { 1,3,1 }, { 2,3,1 }, { 3,3,1 }, { 4,3,1 }, { 3,4,1 }, { 4,3,4 }, { 3,4,4 },
+        { 0,0,2 }, { 1,0,2 }, { 2,0,2 }, { 3,0,2 }, { 4,0,2 }, { 0,4,2 }, { 2,0,4 }, { 3,0,4 }, { 0,1,2 }, { 1,1,2 }, { 2,1,2 }, { 3,1,2 }, { 4,1,2 }, { 1,4,2 }, { 2,1,4 }, { 3,1,4 },
+        { 0,2,2 }, { 1,2,2 }, { 2,2,2 }, { 3,2,2 }, { 4,2,2 }, { 2,4,2 }, { 2,2,4 }, { 3,2,4 }, { 0,3,2 }, { 1,3,2 }, { 2,3,2 }, { 3,3,2 }, { 4,3,2 }, { 3,4,2 }, { 2,3,4 }, { 3,3,4 },
+        { 0,0,3 }, { 1,0,3 }, { 2,0,3 }, { 3,0,3 }, { 4,0,3 }, { 0,4,3 }, { 0,0,4 }, { 1,0,4 }, { 0,1,3 }, { 1,1,3 }, { 2,1,3 }, { 3,1,3 }, { 4,1,3 }, { 1,4,3 }, { 0,1,4 }, { 1,1,4 },
+        { 0,2,3 }, { 1,2,3 }, { 2,2,3 }, { 3,2,3 }, { 4,2,3 }, { 2,4,3 }, { 0,2,4 }, { 1,2,4 }, { 0,3,3 }, { 1,3,3 }, { 2,3,3 }, { 3,3,3 }, { 4,3,3 }, { 3,4,3 }, { 0,3,4 }, { 1,3,4 }
+    };
+
+    const deUint32 (& quints)[3] = quintsFromQ[Q];
+    for (int i = 0; i < numValues; i++)
+    {
+        dst[i].m    = m[i];
+        dst[i].tq   = quints[i];
+        dst[i].v    = (quints[i] << numBits) + m[i];
+    }
+}
+
+inline void decodeISEBitBlock (ISEDecodedResult* dst, BitAccessStream& data, int numBits)
+{
+    dst[0].m = data.getNext(numBits);
+    dst[0].v = dst[0].m;
+}
+
+void decodeISE (ISEDecodedResult* dst, int numValues, BitAccessStream& data, const ISEParams& params)
+{
+    if (params.mode == ISEMODE_TRIT)
+    {
+        const int numBlocks = deDivRoundUp32(numValues, 5);
+        for (int blockNdx = 0; blockNdx < numBlocks; blockNdx++)
+        {
+            const int numValuesInBlock = blockNdx == numBlocks-1 ? numValues - 5*(numBlocks-1) : 5;
+            decodeISETritBlock(&dst[5*blockNdx], numValuesInBlock, data, params.numBits);
+        }
+    }
+    else if (params.mode == ISEMODE_QUINT)
+    {
+        const int numBlocks = deDivRoundUp32(numValues, 3);
+        for (int blockNdx = 0; blockNdx < numBlocks; blockNdx++)
+        {
+            const int numValuesInBlock = blockNdx == numBlocks-1 ? numValues - 3*(numBlocks-1) : 3;
+            decodeISEQuintBlock(&dst[3*blockNdx], numValuesInBlock, data, params.numBits);
+        }
+    }
+    else
+    {
+        DE_ASSERT(params.mode == ISEMODE_PLAIN_BIT);
+        for (int i = 0; i < numValues; i++)
+            decodeISEBitBlock(&dst[i], data, params.numBits);
+    }
+}
+
+void unquantizeColorEndpoints (deUint32* dst, const ISEDecodedResult* iseResults, int numEndpoints, const ISEParams& iseParams)
+{
+    if ((iseParams.mode == ISEMODE_TRIT) || (iseParams.mode == ISEMODE_QUINT))
+    {
+        const int rangeCase             = iseParams.numBits*2 - (iseParams.mode == ISEMODE_TRIT ? 2 : 1);
+        DE_ASSERT(basisu_astc::inRange(rangeCase, 0, 10));
+
+        static const deUint32   Ca[11]  = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 };
+        const deUint32          C       = Ca[rangeCase];
+
+        for (int endpointNdx = 0; endpointNdx < numEndpoints; endpointNdx++)
+        {
+            const deUint32 a = getBit(iseResults[endpointNdx].m, 0);
+            const deUint32 b = getBit(iseResults[endpointNdx].m, 1);
+            const deUint32 c = getBit(iseResults[endpointNdx].m, 2);
+            const deUint32 d = getBit(iseResults[endpointNdx].m, 3);
+            const deUint32 e = getBit(iseResults[endpointNdx].m, 4);
+            const deUint32 f = getBit(iseResults[endpointNdx].m, 5);
+            const deUint32 A = (a == 0) ? 0 : (1<<9)-1;
+
+            const deUint32 B = (rangeCase == 0)   ? 0
+                             : (rangeCase == 1)   ? 0
+                             : (rangeCase == 2)   ? ((b << 8) | (b << 4) | (b << 2) | (b << 1))
+                             : (rangeCase == 3)   ? ((b << 8) | (b << 3) | (b << 2))
+                             : (rangeCase == 4)   ? ((c << 8) | (b << 7) | (c << 3) | (b << 2) | (c << 1) | (b << 0))
+                             : (rangeCase == 5)   ? ((c << 8) | (b << 7) | (c << 2) | (b << 1) | (c << 0))
+                             : (rangeCase == 6)   ? ((d << 8) | (c << 7) | (b << 6) | (d << 2) | (c << 1) | (b << 0))
+                             : (rangeCase == 7)   ? ((d << 8) | (c << 7) | (b << 6) | (d << 1) | (c << 0))
+                             : (rangeCase == 8)   ? ((e << 8) | (d << 7) | (c << 6) | (b << 5) | (e << 1) | (d << 0))
+                             : (rangeCase == 9)   ? ((e << 8) | (d << 7) | (c << 6) | (b << 5) | (e << 0))
+                             : (rangeCase == 10)  ? ((f << 8) | (e << 7) | (d << 6) | (c << 5) | (b << 4) | (f << 0))
+                             : (deUint32)-1;
+
+            DE_ASSERT(B != (deUint32)-1);
+            dst[endpointNdx] = (((iseResults[endpointNdx].tq*C + B) ^ A) >> 2) | (A & 0x80);
+        }
+    }
+    else
+    {
+        DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT);
+        for (int endpointNdx = 0; endpointNdx < numEndpoints; endpointNdx++)
+            dst[endpointNdx] = bitReplicationScale(iseResults[endpointNdx].v, iseParams.numBits, 8);
+    }
+}
+
+inline void bitTransferSigned (deInt32& a, deInt32& b)
+{
+    b >>= 1;
+    b |= a & 0x80;
+    a >>= 1;
+    a &= 0x3f;
+    if (isBitSet(a, 5))
+        a -= 0x40;
+}
+
+inline UVec4 clampedRGBA (const IVec4& rgba)
+{
+    return UVec4(basisu_astc::clamp(rgba.x(), 0, 0xff),
+        basisu_astc::clamp(rgba.y(), 0, 0xff),
+        basisu_astc::clamp(rgba.z(), 0, 0xff),
+        basisu_astc::clamp(rgba.w(), 0, 0xff));
+}
+
+inline IVec4 blueContract (int r, int g, int b, int a)
+{
+    return IVec4((r+b)>>1, (g+b)>>1, b, a);
+}
+
+inline bool isColorEndpointModeHDR (deUint32 mode)
+{
+    return (mode == 2)    ||
+           (mode == 3)    ||
+           (mode == 7)    ||
+           (mode == 11)   ||
+           (mode == 14)   ||
+           (mode == 15);
+}
+
+void decodeHDREndpointMode7 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3)
+{
+    const deUint32 m10      = getBit(v1, 7) | (getBit(v2, 7) << 1);
+    const deUint32 m23      = getBits(v0, 6, 7);
+
+    const deUint32 majComp  = (m10 != 3)  ? m10
+                            : (m23 != 3)  ? m23
+                            :             0;
+    
+    const deUint32 mode     = (m10 != 3)  ? m23
+                            : (m23 != 3)  ? 4
+                            :             5;
+
+    deInt32         red     = (deInt32)getBits(v0, 0, 5);
+    deInt32         green   = (deInt32)getBits(v1, 0, 4);
+    deInt32         blue    = (deInt32)getBits(v2, 0, 4);
+    deInt32         scale   = (deInt32)getBits(v3, 0, 4);
+
+    {
+#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT)
+#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5, V6,S6) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); SHOR(V6,S6,x6); } while (false)
+
+        const deUint32  x0  = getBit(v1, 6);
+        const deUint32  x1  = getBit(v1, 5);
+        const deUint32  x2  = getBit(v2, 6);
+        const deUint32  x3  = getBit(v2, 5);
+        const deUint32  x4  = getBit(v3, 7);
+        const deUint32  x5  = getBit(v3, 6);
+        const deUint32  x6  = getBit(v3, 5);
+
+        deInt32&        R   = red;
+        deInt32&        G   = green;
+        deInt32&        B   = blue;
+        deInt32&        S   = scale;
+
+        switch (mode)
+        {
+            case 0: ASSIGN_X_BITS(R,9,  R,8,  R,7,  R,10,  R,6,  S,6,   S,5); break;
+            case 1: ASSIGN_X_BITS(R,8,  G,5,  R,7,  B,5,   R,6,  R,10,  R,9); break;
+            case 2: ASSIGN_X_BITS(R,9,  R,8,  R,7,  R,6,   S,7,  S,6,   S,5); break;
+            case 3: ASSIGN_X_BITS(R,8,  G,5,  R,7,  B,5,   R,6,  S,6,   S,5); break;
+            case 4: ASSIGN_X_BITS(G,6,  G,5,  B,6,  B,5,   R,6,  R,7,   S,5); break;
+            case 5: ASSIGN_X_BITS(G,6,  G,5,  B,6,  B,5,   R,6,  S,6,   S,5); break;
+            default:
+                DE_ASSERT(false);
+        }
+#undef ASSIGN_X_BITS
+#undef SHOR
+    }
+
+    static const int shiftAmounts[] = { 1, 1, 2, 3, 4, 5 };
+    DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(shiftAmounts));
+
+    red     <<= shiftAmounts[mode];
+    green   <<= shiftAmounts[mode];
+    blue    <<= shiftAmounts[mode];
+    scale   <<= shiftAmounts[mode];
+
+    if (mode != 5)
+    {
+        green   = red - green;
+        blue    = red - blue;
+    }
+
+    if (majComp == 1)
+        std::swap(red, green);
+    else if (majComp == 2)
+        std::swap(red, blue);
+
+    e0 = UVec4(basisu_astc::clamp(red   - scale,    0, 0xfff),
+        basisu_astc::clamp(green    - scale,    0, 0xfff),
+        basisu_astc::clamp(blue - scale,    0, 0xfff),
+               0x780);
+
+    e1 = UVec4(basisu_astc::clamp(red,              0, 0xfff),
+        basisu_astc::clamp(green,               0, 0xfff),
+        basisu_astc::clamp(blue,                0, 0xfff),
+               0x780);
+}
+
+void decodeHDREndpointMode11 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5)
+{
+    const deUint32 major = (getBit(v5, 7) << 1) | getBit(v4, 7);
+
+    if (major == 3)
+    {
+        e0 = UVec4(v0<<4, v2<<4, getBits(v4,0,6)<<5, 0x780);
+        e1 = UVec4(v1<<4, v3<<4, getBits(v5,0,6)<<5, 0x780);
+    }
+    else
+    {
+        const deUint32 mode = (getBit(v3, 7) << 2) | (getBit(v2, 7) << 1) | getBit(v1, 7);
+
+        deInt32 a   = (deInt32)((getBit(v1, 6) << 8) | v0);
+        deInt32 c   = (deInt32)(getBits(v1, 0, 5));
+        deInt32 b0  = (deInt32)(getBits(v2, 0, 5));
+        deInt32 b1  = (deInt32)(getBits(v3, 0, 5));
+        deInt32 d0  = (deInt32)(getBits(v4, 0, 4));
+        deInt32 d1  = (deInt32)(getBits(v5, 0, 4));
+
+        {
+#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT)
+#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); } while (false)
+            const deUint32 x0 = getBit(v2, 6);
+            const deUint32 x1 = getBit(v3, 6);
+            const deUint32 x2 = getBit(v4, 6);
+            const deUint32 x3 = getBit(v5, 6);
+            const deUint32 x4 = getBit(v4, 5);
+            const deUint32 x5 = getBit(v5, 5);
+
+            switch (mode)
+            {
+                case 0: ASSIGN_X_BITS(b0,6,  b1,6,   d0,6,  d1,6,  d0,5,  d1,5); break;
+                case 1: ASSIGN_X_BITS(b0,6,  b1,6,   b0,7,  b1,7,  d0,5,  d1,5); break;
+                case 2: ASSIGN_X_BITS(a,9,   c,6,    d0,6,  d1,6,  d0,5,  d1,5); break;
+                case 3: ASSIGN_X_BITS(b0,6,  b1,6,   a,9,   c,6,   d0,5,  d1,5); break;
+                case 4: ASSIGN_X_BITS(b0,6,  b1,6,   b0,7,  b1,7,  a,9,   a,10); break;
+                case 5: ASSIGN_X_BITS(a,9,   a,10,   c,7,   c,6,   d0,5,  d1,5); break;
+                case 6: ASSIGN_X_BITS(b0,6,  b1,6,   a,11,  c,6,   a,9,   a,10); break;
+                case 7: ASSIGN_X_BITS(a,9,   a,10,   a,11,  c,6,   d0,5,  d1,5); break;
+                default:
+                    DE_ASSERT(false);
+            }
+#undef ASSIGN_X_BITS
+#undef SHOR
+        }
+
+        static const int numDBits[] = { 7, 6, 7, 6, 5, 6, 5, 6 };
+        DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(numDBits));
+        d0 = signExtend(d0, numDBits[mode]);
+        d1 = signExtend(d1, numDBits[mode]);
+        
+        const int shiftAmount = (mode >> 1) ^ 3;
+        a   = (uint32_t)a  << shiftAmount;
+        c   = (uint32_t)c  << shiftAmount;
+        b0  = (uint32_t)b0 << shiftAmount;
+        b1  = (uint32_t)b1 << shiftAmount;
+        d0  = (uint32_t)d0 << shiftAmount;
+        d1  = (uint32_t)d1 << shiftAmount;
+
+        e0 = UVec4(basisu_astc::clamp(a-c, 0, 0xfff), basisu_astc::clamp(a-b0-c-d0, 0, 0xfff), basisu_astc::clamp(a-b1-c-d1, 0, 0xfff), 0x780);
+        e1 = UVec4(basisu_astc::clamp(a, 0, 0xfff), basisu_astc::clamp(a-b0, 0, 0xfff), basisu_astc::clamp(a-b1, 0, 0xfff), 0x780);
+
+        if (major == 1)
+        {
+            std::swap(e0.x(), e0.y());
+            std::swap(e1.x(), e1.y());
+        }
+        else if (major == 2)
+        {
+            std::swap(e0.x(), e0.z());
+            std::swap(e1.x(), e1.z());
+        }
+    }
+}
+
+void decodeHDREndpointMode15(UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5, deUint32 v6In, deUint32 v7In)
+{
+    decodeHDREndpointMode11(e0, e1, v0, v1, v2, v3, v4, v5);
+    
+    const deUint32  mode    = (getBit(v7In, 7) << 1) | getBit(v6In, 7);
+    deInt32         v6      = (deInt32)getBits(v6In, 0, 6);
+    deInt32         v7      = (deInt32)getBits(v7In, 0, 6);
+
+    if (mode == 3)
+    {
+        e0.w() = v6 << 5;
+        e1.w() = v7 << 5;
+    }
+    else
+    {
+        v6 |= (v7 << (mode+1)) & 0x780;
+        v7 &= (0x3f >> mode);
+        v7 ^= 0x20 >> mode;
+        v7 -= 0x20 >> mode;
+        v6 <<= 4-mode;
+        v7 <<= 4-mode;
+        v7 += v6;
+        v7 = basisu_astc::clamp(v7, 0, 0xfff);
+        e0.w() = v6;
+        e1.w() = v7;
+    }
+}
+
+void decodeColorEndpoints (ColorEndpointPair* dst, const deUint32* unquantizedEndpoints, const deUint32* endpointModes, int numPartitions)
+{
+    int unquantizedNdx = 0;
+
+    for (int partitionNdx = 0; partitionNdx < numPartitions; partitionNdx++)
+    {
+        const deUint32      endpointMode    = endpointModes[partitionNdx];
+        const deUint32*     v               = &unquantizedEndpoints[unquantizedNdx];
+
+        UVec4&              e0              = dst[partitionNdx].e0;
+        UVec4&              e1              = dst[partitionNdx].e1;
+        unquantizedNdx += computeNumColorEndpointValues(endpointMode);
+
+        switch (endpointMode)
+        {
+            case 0:
+            {
+                e0 = UVec4(v[0], v[0], v[0], 0xff);
+                e1 = UVec4(v[1], v[1], v[1], 0xff);
+                break;
+            }
+            case 1:
+            {
+                const deUint32 L0 = (v[0] >> 2) | (getBits(v[1], 6, 7) << 6);
+                const deUint32 L1 = basisu_astc::min(0xffu, L0 + getBits(v[1], 0, 5));
+                e0 = UVec4(L0, L0, L0, 0xff);
+                e1 = UVec4(L1, L1, L1, 0xff);
+                break;
+            }
+            case 2:
+            {
+                const deUint32 v1Gr     = v[1] >= v[0];
+                const deUint32 y0       = v1Gr ? v[0]<<4 : (v[1]<<4) + 8;
+                const deUint32 y1       = v1Gr ? v[1]<<4 : (v[0]<<4) - 8;
+                e0 = UVec4(y0, y0, y0, 0x780);
+                e1 = UVec4(y1, y1, y1, 0x780);
+                break;
+            }
+            case 3:
+            {
+                const bool      m   = isBitSet(v[0], 7);
+                const deUint32  y0  = m ? (getBits(v[1], 5, 7) << 9) | (getBits(v[0], 0, 6) << 2)
+                                        : (getBits(v[1], 4, 7) << 8) | (getBits(v[0], 0, 6) << 1);
+                const deUint32  d   = m ? getBits(v[1], 0, 4) << 2
+                                        : getBits(v[1], 0, 3) << 1;
+                const deUint32  y1  = basisu_astc::min(0xfffu, y0+d);
+                e0 = UVec4(y0, y0, y0, 0x780);
+                e1 = UVec4(y1, y1, y1, 0x780);
+                break;
+            }
+            case 4:
+            {
+                e0 = UVec4(v[0], v[0], v[0], v[2]);
+                e1 = UVec4(v[1], v[1], v[1], v[3]);
+                break;
+            }
+            case 5:
+            {
+                deInt32 v0 = (deInt32)v[0];
+                deInt32 v1 = (deInt32)v[1];
+                deInt32 v2 = (deInt32)v[2];
+                deInt32 v3 = (deInt32)v[3];
+                bitTransferSigned(v1, v0);
+                bitTransferSigned(v3, v2);
+                e0 = clampedRGBA(IVec4(v0,      v0,     v0,     v2));
+                e1 = clampedRGBA(IVec4(v0+v1,   v0+v1,  v0+v1,  v2+v3));
+                break;
+            }
+            case 6:
+                e0 = UVec4((v[0]*v[3]) >> 8,    (v[1]*v[3]) >> 8,   (v[2]*v[3]) >> 8,   0xff);
+                e1 = UVec4(v[0],                v[1],               v[2],               0xff);
+                break;
+            case 7:
+                decodeHDREndpointMode7(e0, e1, v[0], v[1], v[2], v[3]);
+                break;
+            case 8:
+            {
+                if (v[1]+v[3]+v[5] >= v[0]+v[2]+v[4])
+                {
+                    e0 = UVec4(v[0], v[2], v[4], 0xff);
+                    e1 = UVec4(v[1], v[3], v[5], 0xff);
+                }
+                else
+                {
+                    e0 = blueContract(v[1], v[3], v[5], 0xff).asUint();
+                    e1 = blueContract(v[0], v[2], v[4], 0xff).asUint();
+                }
+                break;
+            }
+            case 9:
+            {
+                deInt32 v0 = (deInt32)v[0];
+                deInt32 v1 = (deInt32)v[1];
+                deInt32 v2 = (deInt32)v[2];
+                deInt32 v3 = (deInt32)v[3];
+                deInt32 v4 = (deInt32)v[4];
+                deInt32 v5 = (deInt32)v[5];
+                bitTransferSigned(v1, v0);
+                bitTransferSigned(v3, v2);
+                bitTransferSigned(v5, v4);
+                if (v1+v3+v5 >= 0)
+                {
+                    e0 = clampedRGBA(IVec4(v0,      v2,     v4,     0xff));
+                    e1 = clampedRGBA(IVec4(v0+v1,   v2+v3,  v4+v5,  0xff));
+                }
+                else
+                {
+                    e0 = clampedRGBA(blueContract(v0+v1,    v2+v3,  v4+v5,  0xff));
+                    e1 = clampedRGBA(blueContract(v0,       v2,     v4,     0xff));
+                }
+                break;
+            }
+            case 10:
+            {
+                e0 = UVec4((v[0]*v[3]) >> 8,    (v[1]*v[3]) >> 8,   (v[2]*v[3]) >> 8,   v[4]);
+                e1 = UVec4(v[0],                v[1],               v[2],               v[5]);
+                break;
+            }
+            case 11:
+            {
+                decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]);
+                break;
+            }
+            case 12:
+            {
+                if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4])
+                {
+                    e0 = UVec4(v[0], v[2], v[4], v[6]);
+                    e1 = UVec4(v[1], v[3], v[5], v[7]);
+                }
+                else
+                {
+                    e0 = clampedRGBA(blueContract(v[1], v[3], v[5], v[7]));
+                    e1 = clampedRGBA(blueContract(v[0], v[2], v[4], v[6]));
+                }
+                break;
+            }
+            case 13:
+            {
+                deInt32 v0 = (deInt32)v[0];
+                deInt32 v1 = (deInt32)v[1];
+                deInt32 v2 = (deInt32)v[2];
+                deInt32 v3 = (deInt32)v[3];
+                deInt32 v4 = (deInt32)v[4];
+                deInt32 v5 = (deInt32)v[5];
+                deInt32 v6 = (deInt32)v[6];
+                deInt32 v7 = (deInt32)v[7];
+                bitTransferSigned(v1, v0);
+                bitTransferSigned(v3, v2);
+                bitTransferSigned(v5, v4);
+                bitTransferSigned(v7, v6);
+                if (v1+v3+v5 >= 0)
+                {
+                    e0 = clampedRGBA(IVec4(v0,      v2,     v4,     v6));
+                    e1 = clampedRGBA(IVec4(v0+v1,   v2+v3,  v4+v5,  v6+v7));
+                }
+                else
+                {
+                    e0 = clampedRGBA(blueContract(v0+v1,    v2+v3,  v4+v5,  v6+v7));
+                    e1 = clampedRGBA(blueContract(v0,       v2,     v4,     v6));
+                }
+                break;
+            }
+            case 14:
+                decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]);
+                e0.w() = v[6];
+                e1.w() = v[7];
+                break;
+            case 15:
+            {
+                decodeHDREndpointMode15(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+                break;
+            }
+            default:
+                DE_ASSERT(false);
+        }
+    }
+}
+
+void computeColorEndpoints (ColorEndpointPair* dst, const Block128& blockData, const deUint32* endpointModes, int numPartitions, int numColorEndpointValues, const ISEParams& iseParams, int numBitsAvailable)
+{
+    const int           colorEndpointDataStart = (numPartitions == 1) ? 17 : 29;
+    ISEDecodedResult    colorEndpointData[18];
+    
+    {
+        BitAccessStream dataStream(blockData, colorEndpointDataStart, numBitsAvailable, true);
+        decodeISE(&colorEndpointData[0], numColorEndpointValues, dataStream, iseParams);
+    }
+
+    {
+        deUint32 unquantizedEndpoints[18];
+        unquantizeColorEndpoints(&unquantizedEndpoints[0], &colorEndpointData[0], numColorEndpointValues, iseParams);
+        decodeColorEndpoints(dst, &unquantizedEndpoints[0], &endpointModes[0], numPartitions);
+    }
+}
+
+void unquantizeWeights (deUint32 dst[64], const ISEDecodedResult* weightGrid, const ASTCBlockMode& blockMode)
+{
+    const int           numWeights  = computeNumWeights(blockMode);
+    const ISEParams&    iseParams   = blockMode.weightISEParams;
+
+    if ((iseParams.mode == ISEMODE_TRIT) || (iseParams.mode == ISEMODE_QUINT))
+    {
+        const int rangeCase = iseParams.numBits*2 + (iseParams.mode == ISEMODE_QUINT ? 1 : 0);
+
+        if ((rangeCase == 0) || (rangeCase == 1))
+        {
+            static const deUint32 map0[3]   = { 0, 32, 63 };
+            static const deUint32 map1[5]   = { 0, 16, 32, 47, 63 };
+            const deUint32* const map = (rangeCase == 0) ? &map0[0] : &map1[0];
+
+            for (int i = 0; i < numWeights; i++)
+            {
+                DE_ASSERT(weightGrid[i].v < (rangeCase == 0 ? 3u : 5u));
+                dst[i] = map[weightGrid[i].v];
+            }
+        }
+        else
+        {
+            DE_ASSERT(rangeCase <= 6);
+            static const deUint32   Ca[5]   = { 50, 28, 23, 13, 11 };
+            const deUint32          C       = Ca[rangeCase-2];
+
+            for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+            {
+                const deUint32 a = getBit(weightGrid[weightNdx].m, 0);
+                const deUint32 b = getBit(weightGrid[weightNdx].m, 1);
+                const deUint32 c = getBit(weightGrid[weightNdx].m, 2);
+                
+                const deUint32 A = (a == 0) ? 0 : (1<<7)-1;
+                const deUint32 B = (rangeCase == 2) ? 0
+                                 : (rangeCase == 3) ? 0
+                                 : (rangeCase == 4) ? (b << 6) | (b << 2) | (b << 0)
+                                 : (rangeCase == 5) ? (b << 6) | (b << 1)
+                                 : (rangeCase == 6) ? (c << 6) | (b << 5) | (c << 1) |  (b << 0)
+                                 : (deUint32)-1;
+
+                dst[weightNdx] = (((weightGrid[weightNdx].tq*C + B) ^ A) >> 2) | (A & 0x20);
+            }
+        }
+    }
+    else
+    {
+        DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT);
+        for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+            dst[weightNdx] = bitReplicationScale(weightGrid[weightNdx].v, iseParams.numBits, 6);
+    }
+
+    for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+        dst[weightNdx] += dst[weightNdx] > 32 ? 1 : 0;
+
+    // Initialize nonexistent weights to poison values
+    for (int weightNdx = numWeights; weightNdx < 64; weightNdx++)
+        dst[weightNdx] = ~0u;
+}
+
+void interpolateWeights (TexelWeightPair* dst, const deUint32 (&unquantizedWeights) [64], int blockWidth, int blockHeight, const ASTCBlockMode& blockMode)
+{
+    const int       numWeightsPerTexel  = blockMode.isDualPlane ? 2 : 1;
+    const deUint32  scaleX              = (1024 + blockWidth/2) / (blockWidth-1);
+    const deUint32  scaleY              = (1024 + blockHeight/2) / (blockHeight-1);
+    DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= (int)DE_LENGTH_OF_ARRAY(unquantizedWeights));
+
+    for (int texelY = 0; texelY < blockHeight; texelY++)
+    {
+        for (int texelX = 0; texelX < blockWidth; texelX++)
+        {
+            const deUint32 gX   = (scaleX*texelX*(blockMode.weightGridWidth-1) + 32) >> 6;
+            const deUint32 gY   = (scaleY*texelY*(blockMode.weightGridHeight-1) + 32) >> 6;
+            const deUint32 jX   = gX >> 4;
+            const deUint32 jY   = gY >> 4;
+            const deUint32 fX   = gX & 0xf;
+            const deUint32 fY   = gY & 0xf;
+            const deUint32 w11  = (fX*fY + 8) >> 4;
+            const deUint32 w10  = fY - w11;
+            const deUint32 w01  = fX - w11;
+            const deUint32 w00  = 16 - fX - fY + w11;
+            const deUint32 i00  = jY*blockMode.weightGridWidth + jX;
+            const deUint32 i01  = i00 + 1;
+            const deUint32 i10  = i00 + blockMode.weightGridWidth;
+            const deUint32 i11  = i00 + blockMode.weightGridWidth + 1;
+            
+            // These addresses can be out of bounds, but respective weights will be 0 then.
+            DE_ASSERT(deInBounds32(i00, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w00 == 0);
+            DE_ASSERT(deInBounds32(i01, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w01 == 0);
+            DE_ASSERT(deInBounds32(i10, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w10 == 0);
+            DE_ASSERT(deInBounds32(i11, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w11 == 0);
+
+            for (int texelWeightNdx = 0; texelWeightNdx < numWeightsPerTexel; texelWeightNdx++)
+            {
+                // & 0x3f clamps address to bounds of unquantizedWeights
+                const deUint32 p00  = unquantizedWeights[(i00 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                const deUint32 p01  = unquantizedWeights[(i01 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                const deUint32 p10  = unquantizedWeights[(i10 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                const deUint32 p11  = unquantizedWeights[(i11 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                                
+                dst[texelY*blockWidth + texelX].w[texelWeightNdx] = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
+            }
+        }
+    }
+}
+
+void computeTexelWeights (TexelWeightPair* dst, const Block128& blockData, int blockWidth, int blockHeight, const ASTCBlockMode& blockMode)
+{
+    ISEDecodedResult weightGrid[64];
+
+    {
+        BitAccessStream dataStream(blockData, 127, computeNumRequiredBits(blockMode.weightISEParams, computeNumWeights(blockMode)), false);
+        decodeISE(&weightGrid[0], computeNumWeights(blockMode), dataStream, blockMode.weightISEParams);
+    }
+
+    {
+        deUint32 unquantizedWeights[64];
+        unquantizeWeights(&unquantizedWeights[0], &weightGrid[0], blockMode);
+
+        interpolateWeights(dst, unquantizedWeights, blockWidth, blockHeight, blockMode);
+    }
+}
+
+inline deUint32 hash52 (deUint32 v)
+{
+    deUint32 p = v;
+    p ^= p >> 15;   p -= p << 17;   p += p << 7;    p += p << 4;
+    p ^= p >>  5;   p += p << 16;   p ^= p >> 7;    p ^= p >> 3;
+    p ^= p <<  6;   p ^= p >> 17;
+    return p;
+}
+
+int computeTexelPartition (deUint32 seedIn, deUint32 xIn, deUint32 yIn, deUint32 zIn, int numPartitions, bool smallBlock)
+{
+    DE_ASSERT(zIn == 0);
+
+    const deUint32  x       = smallBlock ? xIn << 1 : xIn;
+    const deUint32  y       = smallBlock ? yIn << 1 : yIn;
+    const deUint32  z       = smallBlock ? zIn << 1 : zIn;
+    const deUint32  seed    = seedIn + 1024*(numPartitions-1);
+    const deUint32  rnum    = hash52(seed);
+
+    deUint8         seed1   = (deUint8)( rnum                           & 0xf);
+    deUint8         seed2   = (deUint8)((rnum >>  4)                    & 0xf);
+    deUint8         seed3   = (deUint8)((rnum >>  8)                    & 0xf);
+    deUint8         seed4   = (deUint8)((rnum >> 12)                    & 0xf);
+    deUint8         seed5   = (deUint8)((rnum >> 16)                    & 0xf);
+    deUint8         seed6   = (deUint8)((rnum >> 20)                    & 0xf);
+    deUint8         seed7   = (deUint8)((rnum >> 24)                    & 0xf);
+    deUint8         seed8   = (deUint8)((rnum >> 28)                    & 0xf);
+    deUint8         seed9   = (deUint8)((rnum >> 18)                    & 0xf);
+    deUint8         seed10  = (deUint8)((rnum >> 22)                    & 0xf);
+    deUint8         seed11  = (deUint8)((rnum >> 26)                    & 0xf);
+    deUint8         seed12  = (deUint8)(((rnum >> 30) | (rnum << 2))    & 0xf);
+
+    seed1  = (deUint8)(seed1  * seed1 );
+    seed2  = (deUint8)(seed2  * seed2 );
+    seed3  = (deUint8)(seed3  * seed3 );
+    seed4  = (deUint8)(seed4  * seed4 );
+    seed5  = (deUint8)(seed5  * seed5 );
+    seed6  = (deUint8)(seed6  * seed6 );
+    seed7  = (deUint8)(seed7  * seed7 );
+    seed8  = (deUint8)(seed8  * seed8 );
+    seed9  = (deUint8)(seed9  * seed9 );
+    seed10 = (deUint8)(seed10 * seed10);
+    seed11 = (deUint8)(seed11 * seed11);
+    seed12 = (deUint8)(seed12 * seed12);
+
+    const int shA = (seed & 2) != 0     ? 4     : 5;
+    const int shB = numPartitions == 3  ? 6     : 5;
+    const int sh1 = (seed & 1) != 0     ? shA   : shB;
+    const int sh2 = (seed & 1) != 0     ? shB   : shA;
+    const int sh3 = (seed & 0x10) != 0  ? sh1   : sh2;
+
+    seed1  = (deUint8)(seed1  >> sh1);
+    seed2  = (deUint8)(seed2  >> sh2);
+    seed3  = (deUint8)(seed3  >> sh1);
+    seed4  = (deUint8)(seed4  >> sh2);
+    seed5  = (deUint8)(seed5  >> sh1);
+    seed6  = (deUint8)(seed6  >> sh2);
+    seed7  = (deUint8)(seed7  >> sh1);
+    seed8  = (deUint8)(seed8  >> sh2);
+    seed9  = (deUint8)(seed9  >> sh3);
+    seed10 = (deUint8)(seed10 >> sh3);
+    seed11 = (deUint8)(seed11 >> sh3);
+    seed12 = (deUint8)(seed12 >> sh3);
+
+    const int a =                         0x3f & (seed1*x + seed2*y + seed11*z + (rnum >> 14));
+    const int b =                         0x3f & (seed3*x + seed4*y + seed12*z + (rnum >> 10));
+    const int c = (numPartitions >= 3) ?  0x3f & (seed5*x + seed6*y + seed9*z  + (rnum >>  6))    : 0;
+    const int d = (numPartitions >= 4) ?  0x3f & (seed7*x + seed8*y + seed10*z + (rnum >>  2))    : 0;
+
+    return (a >= b && a >= c && a >= d) ? 0
+         : (b >= c && b >= d)           ? 1
+         : (c >= d)                     ? 2
+         :                                3;
+}
+
+DecompressResult setTexelColors (void* dst, ColorEndpointPair* colorEndpoints, TexelWeightPair* texelWeights, int ccs, deUint32 partitionIndexSeed,
+                                 int numPartitions, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode, const deUint32* colorEndpointModes)
+{
+    const bool          smallBlock  = blockWidth*blockHeight < 31;
+    DecompressResult    result      = DECOMPRESS_RESULT_VALID_BLOCK;
+    bool                isHDREndpoint[4];
+
+    for (int i = 0; i < numPartitions; i++)
+    {
+        isHDREndpoint[i] = isColorEndpointModeHDR(colorEndpointModes[i]);
+    }
+
+    for (int texelY = 0; texelY < blockHeight; texelY++)
+    {
+        for (int texelX = 0; texelX < blockWidth; texelX++)
+        {
+            const int texelNdx = texelY * blockWidth + texelX;
+            const int colorEndpointNdx = (numPartitions == 1) ? 0 : computeTexelPartition(partitionIndexSeed, texelX, texelY, 0, numPartitions, smallBlock);
+
+            DE_ASSERT(colorEndpointNdx < numPartitions);
+            const UVec4& e0 = colorEndpoints[colorEndpointNdx].e0;
+            const UVec4& e1 = colorEndpoints[colorEndpointNdx].e1;
+            const TexelWeightPair& weight = texelWeights[texelNdx];
+
+            if (isLDRMode && isHDREndpoint[colorEndpointNdx])
+            {
+                if (isSRGB)
+                {
+                    ((deUint8*)dst)[texelNdx * 4 + 0] = 0xff;
+                    ((deUint8*)dst)[texelNdx * 4 + 1] = 0;
+                    ((deUint8*)dst)[texelNdx * 4 + 2] = 0xff;
+                    ((deUint8*)dst)[texelNdx * 4 + 3] = 0xff;
+                }
+                else
+                {
+                    ((float*)dst)[texelNdx * 4 + 0] = 1.0f;
+                    ((float*)dst)[texelNdx * 4 + 1] = 0;
+                    ((float*)dst)[texelNdx * 4 + 2] = 1.0f;
+                    ((float*)dst)[texelNdx * 4 + 3] = 1.0f;
+                }
+                result = DECOMPRESS_RESULT_ERROR;
+            }
+            else
+            {
+                for (int channelNdx = 0; channelNdx < 4; channelNdx++)
+                {
+                    if (!isHDREndpoint[colorEndpointNdx] || (channelNdx == 3 && colorEndpointModes[colorEndpointNdx] == 14)) // \note Alpha for mode 14 is treated the same as LDR.
+                    {
+                        const deUint32 c0 = (e0[channelNdx] << 8) | (isSRGB ? 0x80 : e0[channelNdx]);
+                        const deUint32 c1 = (e1[channelNdx] << 8) | (isSRGB ? 0x80 : e1[channelNdx]);
+                        const deUint32 w = weight.w[ccs == channelNdx ? 1 : 0];
+                        const deUint32 c = (c0 * (64 - w) + c1 * w + 32) / 64;
+
+                        if (isSRGB)
+                            ((deUint8*)dst)[texelNdx * 4 + channelNdx] = (deUint8)((c & 0xff00) >> 8);
+                        else
+                            ((float*)dst)[texelNdx * 4 + channelNdx] = (c == 65535) ? 1.0f : (float)c / 65536.0f;
+                    }
+                    else
+                    {
+                        DE_ASSERT(!isSRGB);
+                        //DE_STATIC_ASSERT((basisu_astc::meta::TypesSame<deFloat16, deUint16>::Value));
+
+                        const deUint32      c0 = e0[channelNdx] << 4;
+                        const deUint32      c1 = e1[channelNdx] << 4;
+                        const deUint32      w = weight.w[(ccs == channelNdx) ? 1 : 0];
+                        const deUint32      c = (c0 * (64 - w) + c1 * w + 32) / 64;
+                        const deUint32      e = getBits(c, 11, 15);
+                        const deUint32      m = getBits(c, 0, 10);
+                        const deUint32      mt = (m < 512) ? (3 * m)
+                            : (m >= 1536) ? (5 * m - 2048)
+                            : (4 * m - 512);
+
+                        const deFloat16     cf = (deFloat16)((e << 10) + (mt >> 3));
+
+                        ((float*)dst)[texelNdx * 4 + channelNdx] = deFloat16To32(isFloat16InfOrNan(cf) ? 0x7bff : cf);
+                    }
+                
+                } // channelNdx
+            }
+        } // texelX
+    } // texelY
+
+    return result;
+}
+
+DecompressResult decompressBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDR)
+{
+    DE_ASSERT(isLDR || !isSRGB);
+    
+    // Decode block mode.
+    const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10));
+    
+    // Check for block mode errors.
+    if (blockMode.isError)
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    // Separate path for void-extent.
+    if (blockMode.isVoidExtent)
+        return decodeVoidExtentBlock(dst, blockData, blockWidth, blockHeight, isSRGB, isLDR);
+    
+    // Compute weight grid values.
+    const int numWeights            = computeNumWeights(blockMode);
+    const int numWeightDataBits     = computeNumRequiredBits(blockMode.weightISEParams, numWeights);
+    const int numPartitions         = (int)blockData.getBits(11, 12) + 1;
+    
+    // Check for errors in weight grid, partition and dual-plane parameters.
+    if ((numWeights > 64)                               ||
+        (numWeightDataBits > 96)                        ||
+        (numWeightDataBits < 24)                        ||
+        (blockMode.weightGridWidth > blockWidth)        ||
+        (blockMode.weightGridHeight > blockHeight)      ||
+        ((numPartitions == 4) && blockMode.isDualPlane))
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    // Compute number of bits available for color endpoint data.
+    const bool  isSingleUniqueCem           = (numPartitions == 1) || (blockData.getBits(23, 24) == 0);
+
+    const int   numConfigDataBits           = ((numPartitions == 1) ? 17 : isSingleUniqueCem ? 29 : 25 + 3*numPartitions) +
+                                              (blockMode.isDualPlane ? 2 : 0);
+
+    const int   numBitsForColorEndpoints    = 128 - numWeightDataBits - numConfigDataBits;
+
+    const int   extraCemBitsStart           = 127 - numWeightDataBits - (isSingleUniqueCem      ? -1
+                                                                        : (numPartitions == 4)  ? 7
+                                                                        : (numPartitions == 3)  ? 4
+                                                                        : (numPartitions == 2)  ? 1
+                                                                        : 0);
+    
+    // Decode color endpoint modes.
+    deUint32 colorEndpointModes[4];
+    decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart);
+    const int numColorEndpointValues = computeNumColorEndpointValues(colorEndpointModes, numPartitions);
+    
+    // Check for errors in color endpoint value count.
+    if ((numColorEndpointValues > 18) || (numBitsForColorEndpoints < (int)deDivRoundUp32(13*numColorEndpointValues, 5)))
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    // Compute color endpoints.
+    ColorEndpointPair colorEndpoints[4];
+    computeColorEndpoints(&colorEndpoints[0], blockData, &colorEndpointModes[0], numPartitions, numColorEndpointValues,
+                          computeMaximumRangeISEParams(numBitsForColorEndpoints, numColorEndpointValues), numBitsForColorEndpoints);
+    
+    // Compute texel weights.
+    TexelWeightPair texelWeights[MAX_BLOCK_WIDTH*MAX_BLOCK_HEIGHT];
+    computeTexelWeights(&texelWeights[0], blockData, blockWidth, blockHeight, blockMode);
+    
+    // Set texel colors.
+    const int       ccs                     = blockMode.isDualPlane ? (int)blockData.getBits(extraCemBitsStart-2, extraCemBitsStart-1) : -1;
+    const deUint32  partitionIndexSeed      = (numPartitions > 1) ? blockData.getBits(13, 22) : (deUint32)-1;
+
+    return setTexelColors(dst, &colorEndpoints[0], &texelWeights[0], ccs, partitionIndexSeed, numPartitions, blockWidth, blockHeight, isSRGB, isLDR, &colorEndpointModes[0]);
+}
+
+// Returns -1 on error, 0 if LDR, 1 if HDR
+int isHDR(const Block128& blockData, int blockWidth, int blockHeight)
+{
+    // Decode block mode.
+    const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10));
+
+    // Check for block mode errors.
+    if (blockMode.isError)
+        return -1;
+
+    // Separate path for void-extent.
+    if (blockMode.isVoidExtent)
+    {
+        const bool isHDRBlock = blockData.isBitSet(9);
+        return isHDRBlock ? 1 : 0;
+    }
+
+    // Compute weight grid values.
+    const int numWeights = computeNumWeights(blockMode);
+    const int numWeightDataBits = computeNumRequiredBits(blockMode.weightISEParams, numWeights);
+    const int numPartitions = (int)blockData.getBits(11, 12) + 1;
+
+    // Check for errors in weight grid, partition and dual-plane parameters.
+    if ((numWeights > 64) ||
+        (numWeightDataBits > 96) ||
+        (numWeightDataBits < 24) ||
+        (blockMode.weightGridWidth > blockWidth) ||
+        (blockMode.weightGridHeight > blockHeight) ||
+        ((numPartitions == 4) && blockMode.isDualPlane))
+    {
+        return -1;
+    }
+
+    // Compute number of bits available for color endpoint data.
+    const bool  isSingleUniqueCem = (numPartitions == 1) || (blockData.getBits(23, 24) == 0);
+
+    const int   extraCemBitsStart = 127 - numWeightDataBits - (isSingleUniqueCem ? -1
+        : (numPartitions == 4) ? 7
+        : (numPartitions == 3) ? 4
+        : (numPartitions == 2) ? 1
+        : 0);
+
+    // Decode color endpoint modes.
+    deUint32 colorEndpointModes[4];
+    decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart);
+    
+    for (int i = 0; i < numPartitions; i++)
+    {
+        if (isColorEndpointModeHDR(colorEndpointModes[i]))
+            return 1;
+    }
+
+    return 0;
+}
+
+typedef uint16_t half_float;
+
+half_float float_to_half(float val, bool toward_zero)
+{
+    union { float f; int32_t i; uint32_t u; } fi = { val };
+    const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1;
+    int s = flt_s, e = 0, m = 0;
+
+    // inf/NaN
+    if (flt_e == 0xff)
+    {
+        e = 31;
+        if (flt_m != 0) // NaN
+            m = 1;
+    }
+    // not zero or denormal
+    else if (flt_e != 0)
+    {
+        int new_exp = flt_e - 127;
+        if (new_exp > 15)
+            e = 31;
+        else if (new_exp < -14)
+        {
+            if (toward_zero)
+                m = (int)truncf((1 << 24) * fabsf(fi.f));
+            else
+                m = lrintf((1 << 24) * fabsf(fi.f));
+        }
+        else
+        {
+            e = new_exp + 15;
+            if (toward_zero)
+                m = (int)truncf((float)flt_m * (1.0f / (float)(1 << 13)));
+            else
+                m = lrintf((float)flt_m * (1.0f / (float)(1 << 13)));
+        }
+    }
+
+    assert((0 <= m) && (m <= 1024));
+    if (m == 1024)
+    {
+        e++;
+        m = 0;
+    }
+
+    assert((s >= 0) && (s <= 1));
+    assert((e >= 0) && (e <= 31));
+    assert((m >= 0) && (m <= 1023));
+
+    half_float result = (half_float)((s << 15) | (e << 10) | m);
+    return result;
+}
+
+float half_to_float(half_float hval)
+{
+    union { float f; uint32_t u; } x = { 0 };
+
+    uint32_t s = ((uint32_t)hval >> 15) & 1;
+    uint32_t e = ((uint32_t)hval >> 10) & 0x1F;
+    uint32_t m = (uint32_t)hval & 0x3FF;
+
+    if (!e)
+    {
+        if (!m)
+        {
+            // +- 0
+            x.u = s << 31;
+            return x.f;
+        }
+        else
+        {
+            // denormalized
+            while (!(m & 0x00000400))
+            {
+                m <<= 1;
+                --e;
+            }
+
+            ++e;
+            m &= ~0x00000400;
+        }
+    }
+    else if (e == 31)
+    {
+        if (m == 0)
+        {
+            // +/- INF
+            x.u = (s << 31) | 0x7f800000;
+            return x.f;
+        }
+        else
+        {
+            // +/- NaN
+            x.u = (s << 31) | 0x7f800000 | (m << 13);
+            return x.f;
+        }
+    }
+
+    e = e + (127 - 15);
+    m = m << 13;
+
+    assert(s <= 1);
+    assert(m <= 0x7FFFFF);
+    assert(e <= 255);
+
+    x.u = m | (e << 23) | (s << 31);
+    return x.f;
+}
+
+} // anonymous
+
+// See https://registry.khronos.org/DataFormat/specs/1.3/dataformat.1.3.inline.html#_hdr_endpoint_decoding
+static void convert_to_half_prec(uint32_t n, float* pVals)
+{
+#if 0
+    const int prev_dir = fesetround(FE_TOWARDZERO);
+
+    for (uint32_t i = 0; i < n; i++)
+        pVals[i] = half_to_float(float_to_half(pVals[i]));
+
+    fesetround(prev_dir);
+
+    for (uint32_t i = 0; i < n; i++)
+    {
+        assert(pVals[i] == half_to_float(float_to_half(pVals[i], true)));
+    }
+#else
+    // This ensures the values are rounded towards zero as half floats.
+    for (uint32_t i = 0; i < n; i++)
+    {
+        pVals[i] = half_to_float(float_to_half(pVals[i], true));
+    }
+#endif
+}
+
+bool decompress_ldr(uint8_t *pDst, const uint8_t * data, bool isSRGB, int blockWidth, int blockHeight)
+{
+    float linear[MAX_BLOCK_WIDTH * MAX_BLOCK_HEIGHT * 4];
+
+    const Block128 blockData(data);
+    
+    // isSRGB is true, this writes uint8_t's. Otherwise it writes floats.
+    if (decompressBlock(isSRGB ? (void*)pDst : (void*)&linear[0], blockData, blockWidth, blockHeight, isSRGB, true) != DECOMPRESS_RESULT_VALID_BLOCK)
+    {
+        return false;
+    }
+
+    if (!isSRGB)
+    {
+        // Convert the floats to 8-bits with rounding.
+        int pix = 0;
+        for (int i = 0; i < blockHeight; i++)
+        {
+            for (int j = 0; j < blockWidth; j++, pix++)
+            {
+                pDst[4 * pix + 0] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 0] * 65536.0f + .5f), 0, 65535) >> 8);
+                pDst[4 * pix + 1] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 1] * 65536.0f + .5f), 0, 65535) >> 8);
+                pDst[4 * pix + 2] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 2] * 65536.0f + .5f), 0, 65535) >> 8);
+                pDst[4 * pix + 3] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 3] * 65536.0f + .5f), 0, 65535) >> 8);
+            }
+        }
+    }
+
+    return true;
+}
+
+bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight)
+{
+    const Block128 blockData(data);
+
+    if (decompressBlock(pDstRGBA, blockData, blockWidth, blockHeight, false, false) != DECOMPRESS_RESULT_VALID_BLOCK)
+    {
+        return false;
+    }
+
+    convert_to_half_prec(blockWidth * blockHeight * 4, pDstRGBA);
+        
+    return true;
+}
+
+bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool &is_hdr)
+{
+    is_hdr = false;
+
+    const Block128 blockData(data);
+    
+    int status = isHDR(blockData, blockWidth, blockHeight);
+    if (status < 0)
+    {
+        return false;
+    }
+
+    is_hdr = (status == 1);
+
+    return true;
+}
+
+} // astc
+
+} // basisu_astc
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/encoder/3rdparty/android_astc_decomp.h b/encoder/3rdparty/android_astc_decomp.h
new file mode 100644
index 0000000..ad13093
--- /dev/null
+++ b/encoder/3rdparty/android_astc_decomp.h
@@ -0,0 +1,45 @@
+// File: android_astc_decomp.h
+#ifndef _TCUASTCUTIL_HPP
+#define _TCUASTCUTIL_HPP
+/*-------------------------------------------------------------------------
+ * drawElements Quality Program Tester Core
+ * ----------------------------------------
+ *
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *//*!
+ * \file
+ * \brief ASTC Utilities.
+ *//*--------------------------------------------------------------------*/
+
+#include <vector>
+#include <stdint.h>
+
+namespace basisu_astc
+{
+namespace astc
+{
+
+// Unpacks a single ASTC block to pDst
+// If isSRGB is true, the spec requires the decoder to scale the LDR 8-bit endpoints to 16-bit before interpolation slightly differently, 
+// which will lead to different outputs. So be sure to set it correctly (ideally it should match whatever the encoder did).
+bool decompress_ldr(uint8_t* pDst, const uint8_t* data, bool isSRGB, int blockWidth, int blockHeight);
+bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight);
+bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool& is_hdr);
+
+} // astc
+} // basisu
+
+#endif
diff --git a/encoder/3rdparty/qoi.h b/encoder/3rdparty/qoi.h
new file mode 100644
index 0000000..be8a2d5
--- /dev/null
+++ b/encoder/3rdparty/qoi.h
@@ -0,0 +1,659 @@
+/*
+
+Copyright (c) 2021, Dominic Szablewski - https://phoboslab.org
+SPDX-License-Identifier: MIT
+
+
+QOI - The "Quite OK Image" format for fast, lossless image compression
+
+-- About
+
+QOI encodes and decodes images in a lossless format. Compared to stb_image and
+stb_image_write QOI offers 20x-50x faster encoding, 3x-4x faster decoding and
+20% better compression.
+
+
+-- Synopsis
+
+// Define `QOI_IMPLEMENTATION` in *one* C/C++ file before including this
+// library to create the implementation.
+
+#define QOI_IMPLEMENTATION
+#include "qoi.h"
+
+// Encode and store an RGBA buffer to the file system. The qoi_desc describes
+// the input pixel data.
+qoi_write("image_new.qoi", rgba_pixels, &(qoi_desc){
+	.width = 1920,
+	.height = 1080,
+	.channels = 4,
+	.colorspace = QOI_SRGB
+});
+
+// Load and decode a QOI image from the file system into a 32bbp RGBA buffer.
+// The qoi_desc struct will be filled with the width, height, number of channels
+// and colorspace read from the file header.
+qoi_desc desc;
+void *rgba_pixels = qoi_read("image.qoi", &desc, 4);
+
+
+
+-- Documentation
+
+This library provides the following functions;
+- qoi_read    -- read and decode a QOI file
+- qoi_decode  -- decode the raw bytes of a QOI image from memory
+- qoi_write   -- encode and write a QOI file
+- qoi_encode  -- encode an rgba buffer into a QOI image in memory
+
+See the function declaration below for the signature and more information.
+
+If you don't want/need the qoi_read and qoi_write functions, you can define
+QOI_NO_STDIO before including this library.
+
+This library uses malloc() and free(). To supply your own malloc implementation
+you can define QOI_MALLOC and QOI_FREE before including this library.
+
+This library uses memset() to zero-initialize the index. To supply your own
+implementation you can define QOI_ZEROARR before including this library.
+
+
+-- Data Format
+
+A QOI file has a 14 byte header, followed by any number of data "chunks" and an
+8-byte end marker.
+
+struct qoi_header_t {
+	char     magic[4];   // magic bytes "qoif"
+	uint32_t width;      // image width in pixels (BE)
+	uint32_t height;     // image height in pixels (BE)
+	uint8_t  channels;   // 3 = RGB, 4 = RGBA
+	uint8_t  colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear
+};
+
+Images are encoded row by row, left to right, top to bottom. The decoder and
+encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous pixel value. An
+image is complete when all pixels specified by width * height have been covered.
+
+Pixels are encoded as
+ - a run of the previous pixel
+ - an index into an array of previously seen pixels
+ - a difference to the previous pixel value in r,g,b
+ - full r,g,b or r,g,b,a values
+
+The color channels are assumed to not be premultiplied with the alpha channel
+("un-premultiplied alpha").
+
+A running array[64] (zero-initialized) of previously seen pixel values is
+maintained by the encoder and decoder. Each pixel that is seen by the encoder
+and decoder is put into this array at the position formed by a hash function of
+the color value. In the encoder, if the pixel value at the index matches the
+current pixel, this index position is written to the stream as QOI_OP_INDEX.
+The hash function for the index is:
+
+	index_position = (r * 3 + g * 5 + b * 7 + a * 11) % 64
+
+Each chunk starts with a 2- or 8-bit tag, followed by a number of data bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
+values encoded in these data bits have the most significant bit on the left.
+
+The 8-bit tags have precedence over the 2-bit tags. A decoder must check for the
+presence of an 8-bit tag first.
+
+The byte stream's end is marked with 7 0x00 bytes followed a single 0x01 byte.
+
+
+The possible chunks are:
+
+
+.- QOI_OP_INDEX ----------.
+|         Byte[0]         |
+|  7  6  5  4  3  2  1  0 |
+|-------+-----------------|
+|  0  0 |     index       |
+`-------------------------`
+2-bit tag b00
+6-bit index into the color index array: 0..63
+
+A valid encoder must not issue 2 or more consecutive QOI_OP_INDEX chunks to the
+same index. QOI_OP_RUN should be used instead.
+
+
+.- QOI_OP_DIFF -----------.
+|         Byte[0]         |
+|  7  6  5  4  3  2  1  0 |
+|-------+-----+-----+-----|
+|  0  1 |  dr |  dg |  db |
+`-------------------------`
+2-bit tag b01
+2-bit   red channel difference from the previous pixel between -2..1
+2-bit green channel difference from the previous pixel between -2..1
+2-bit  blue channel difference from the previous pixel between -2..1
+
+The difference to the current channel values are using a wraparound operation,
+so "1 - 2" will result in 255, while "255 + 1" will result in 0.
+
+Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as
+0 (b00). 1 is stored as 3 (b11).
+
+The alpha value remains unchanged from the previous pixel.
+
+
+.- QOI_OP_LUMA -------------------------------------.
+|         Byte[0]         |         Byte[1]         |
+|  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
+|-------+-----------------+-------------+-----------|
+|  1  0 |  green diff     |   dr - dg   |  db - dg  |
+`---------------------------------------------------`
+2-bit tag b10
+6-bit green channel difference from the previous pixel -32..31
+4-bit   red channel difference minus green channel difference -8..7
+4-bit  blue channel difference minus green channel difference -8..7
+
+The green channel is used to indicate the general direction of change and is
+encoded in 6 bits. The red and blue channels (dr and db) base their diffs off
+of the green channel difference and are encoded in 4 bits. I.e.:
+	dr_dg = (cur_px.r - prev_px.r) - (cur_px.g - prev_px.g)
+	db_dg = (cur_px.b - prev_px.b) - (cur_px.g - prev_px.g)
+
+The difference to the current channel values are using a wraparound operation,
+so "10 - 13" will result in 253, while "250 + 7" will result in 1.
+
+Values are stored as unsigned integers with a bias of 32 for the green channel
+and a bias of 8 for the red and blue channel.
+
+The alpha value remains unchanged from the previous pixel.
+
+
+.- QOI_OP_RUN ------------.
+|         Byte[0]         |
+|  7  6  5  4  3  2  1  0 |
+|-------+-----------------|
+|  1  1 |       run       |
+`-------------------------`
+2-bit tag b11
+6-bit run-length repeating the previous pixel: 1..62
+
+The run-length is stored with a bias of -1. Note that the run-lengths 63 and 64
+(b111110 and b111111) are illegal as they are occupied by the QOI_OP_RGB and
+QOI_OP_RGBA tags.
+
+
+.- QOI_OP_RGB ------------------------------------------.
+|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] |
+|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  |
+|-------------------------+---------+---------+---------|
+|  1  1  1  1  1  1  1  0 |   red   |  green  |  blue   |
+`-------------------------------------------------------`
+8-bit tag b11111110
+8-bit   red channel value
+8-bit green channel value
+8-bit  blue channel value
+
+The alpha value remains unchanged from the previous pixel.
+
+
+.- QOI_OP_RGBA ---------------------------------------------------.
+|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
+|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  | 7 .. 0  |
+|-------------------------+---------+---------+---------+---------|
+|  1  1  1  1  1  1  1  1 |   red   |  green  |  blue   |  alpha  |
+`-----------------------------------------------------------------`
+8-bit tag b11111111
+8-bit   red channel value
+8-bit green channel value
+8-bit  blue channel value
+8-bit alpha channel value
+
+*/
+
+
+/* -----------------------------------------------------------------------------
+Header - Public functions */
+
+#ifndef QOI_H
+#define QOI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions.
+It describes either the input format (for qoi_write and qoi_encode), or is
+filled with the description read from the file header (for qoi_read and
+qoi_decode).
+
+The colorspace in this qoi_desc is an enum where
+	0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
+	1 = all channels are linear
+You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely
+informative. It will be saved to the file header, but does not affect
+how chunks are en-/decoded. */
+
+#define QOI_SRGB   0
+#define QOI_LINEAR 1
+
+typedef struct {
+	unsigned int width;
+	unsigned int height;
+	unsigned char channels;
+	unsigned char colorspace;
+} qoi_desc;
+
+#ifndef QOI_NO_STDIO
+
+/* Encode raw RGB or RGBA pixels into a QOI image and write it to the file
+system. The qoi_desc struct must be filled with the image width, height,
+number of channels (3 = RGB, 4 = RGBA) and the colorspace.
+
+The function returns 0 on failure (invalid parameters, or fopen or malloc
+failed) or the number of bytes written on success. */
+
+int qoi_write(const char *filename, const void *data, const qoi_desc *desc);
+
+
+/* Read and decode a QOI image from the file system. If channels is 0, the
+number of channels from the file header is used. If channels is 3 or 4 the
+output format will be forced into this number of channels.
+
+The function either returns NULL on failure (invalid data, or malloc or fopen
+failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
+will be filled with the description from the file header.
+
+The returned pixel data should be free()d after use. */
+
+void *qoi_read(const char *filename, qoi_desc *desc, int channels);
+
+#endif /* QOI_NO_STDIO */
+
+
+/* Encode raw RGB or RGBA pixels into a QOI image in memory.
+
+The function either returns NULL on failure (invalid parameters or malloc
+failed) or a pointer to the encoded data on success. On success the out_len
+is set to the size in bytes of the encoded data.
+
+The returned qoi data should be free()d after use. */
+
+void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len);
+
+
+/* Decode a QOI image from memory.
+
+The function either returns NULL on failure (invalid parameters or malloc
+failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
+is filled with the description from the file header.
+
+The returned pixel data should be free()d after use. */
+
+void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* QOI_H */
+
+
+/* -----------------------------------------------------------------------------
+Implementation */
+
+#ifdef QOI_IMPLEMENTATION
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef QOI_MALLOC
+	#define QOI_MALLOC(sz) malloc(sz)
+	#define QOI_FREE(p)    free(p)
+#endif
+#ifndef QOI_ZEROARR
+	#define QOI_ZEROARR(a) memset((a),0,sizeof(a))
+#endif
+
+#define QOI_OP_INDEX  0x00 /* 00xxxxxx */
+#define QOI_OP_DIFF   0x40 /* 01xxxxxx */
+#define QOI_OP_LUMA   0x80 /* 10xxxxxx */
+#define QOI_OP_RUN    0xc0 /* 11xxxxxx */
+#define QOI_OP_RGB    0xfe /* 11111110 */
+#define QOI_OP_RGBA   0xff /* 11111111 */
+
+#define QOI_MASK_2    0xc0 /* 11000000 */
+
+#define QOI_COLOR_HASH(C) (C.rgba.r*3 + C.rgba.g*5 + C.rgba.b*7 + C.rgba.a*11)
+#define QOI_MAGIC \
+	(((unsigned int)'q') << 24 | ((unsigned int)'o') << 16 | \
+	 ((unsigned int)'i') <<  8 | ((unsigned int)'f'))
+#define QOI_HEADER_SIZE 14
+
+/* 2GB is the max file size that this implementation can safely handle. We guard
+against anything larger than that, assuming the worst case with 5 bytes per
+pixel, rounded down to a nice clean value. 400 million pixels ought to be
+enough for anybody. */
+#define QOI_PIXELS_MAX ((unsigned int)400000000)
+
+typedef union {
+	struct { unsigned char r, g, b, a; } rgba;
+	unsigned int v;
+} qoi_rgba_t;
+
+static const unsigned char qoi_padding[8] = {0,0,0,0,0,0,0,1};
+
+static void qoi_write_32(unsigned char *bytes, int *p, unsigned int v) {
+	bytes[(*p)++] = (uint8_t)((0xff000000 & v) >> 24);
+	bytes[(*p)++] = (uint8_t)((0x00ff0000 & v) >> 16);
+	bytes[(*p)++] = (uint8_t)((0x0000ff00 & v) >> 8);
+	bytes[(*p)++] = (uint8_t)((0x000000ff & v));
+}
+
+static unsigned int qoi_read_32(const unsigned char *bytes, int *p) {
+	unsigned int a = bytes[(*p)++];
+	unsigned int b = bytes[(*p)++];
+	unsigned int c = bytes[(*p)++];
+	unsigned int d = bytes[(*p)++];
+	return a << 24 | b << 16 | c << 8 | d;
+}
+
+void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) {
+	int i, max_size, p, run;
+	int px_len, px_end, px_pos, channels;
+	unsigned char *bytes;
+	const unsigned char *pixels;
+	qoi_rgba_t index[64];
+	qoi_rgba_t px, px_prev;
+
+	if (
+		data == NULL || out_len == NULL || desc == NULL ||
+		desc->width == 0 || desc->height == 0 ||
+		desc->channels < 3 || desc->channels > 4 ||
+		desc->colorspace > 1 ||
+		desc->height >= QOI_PIXELS_MAX / desc->width
+	) {
+		return NULL;
+	}
+
+	max_size =
+		desc->width * desc->height * (desc->channels + 1) +
+		QOI_HEADER_SIZE + sizeof(qoi_padding);
+
+	p = 0;
+	bytes = (unsigned char *) QOI_MALLOC(max_size);
+	if (!bytes) {
+		return NULL;
+	}
+
+	qoi_write_32(bytes, &p, QOI_MAGIC);
+	qoi_write_32(bytes, &p, desc->width);
+	qoi_write_32(bytes, &p, desc->height);
+	bytes[p++] = desc->channels;
+	bytes[p++] = desc->colorspace;
+
+
+	pixels = (const unsigned char *)data;
+
+	QOI_ZEROARR(index);
+
+	run = 0;
+	px_prev.rgba.r = 0;
+	px_prev.rgba.g = 0;
+	px_prev.rgba.b = 0;
+	px_prev.rgba.a = 255;
+	px = px_prev;
+
+	px_len = desc->width * desc->height * desc->channels;
+	px_end = px_len - desc->channels;
+	channels = desc->channels;
+
+	for (px_pos = 0; px_pos < px_len; px_pos += channels) {
+		px.rgba.r = pixels[px_pos + 0];
+		px.rgba.g = pixels[px_pos + 1];
+		px.rgba.b = pixels[px_pos + 2];
+
+		if (channels == 4) {
+			px.rgba.a = pixels[px_pos + 3];
+		}
+
+		if (px.v == px_prev.v) {
+			run++;
+			if (run == 62 || px_pos == px_end) {
+				bytes[p++] = (uint8_t)(QOI_OP_RUN | (run - 1));
+				run = 0;
+			}
+		}
+		else {
+			int index_pos;
+
+			if (run > 0) {
+				bytes[p++] = (uint8_t)(QOI_OP_RUN | (run - 1));
+				run = 0;
+			}
+
+			index_pos = QOI_COLOR_HASH(px) % 64;
+
+			if (index[index_pos].v == px.v) {
+				bytes[p++] = (uint8_t)(QOI_OP_INDEX | index_pos);
+			}
+			else {
+				index[index_pos] = px;
+
+				if (px.rgba.a == px_prev.rgba.a) {
+					signed char vr = px.rgba.r - px_prev.rgba.r;
+					signed char vg = px.rgba.g - px_prev.rgba.g;
+					signed char vb = px.rgba.b - px_prev.rgba.b;
+
+					signed char vg_r = vr - vg;
+					signed char vg_b = vb - vg;
+
+					if (
+						vr > -3 && vr < 2 &&
+						vg > -3 && vg < 2 &&
+						vb > -3 && vb < 2
+					) {
+						bytes[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
+					}
+					else if (
+						vg_r >  -9 && vg_r <  8 &&
+						vg   > -33 && vg   < 32 &&
+						vg_b >  -9 && vg_b <  8
+					) {
+						bytes[p++] = QOI_OP_LUMA     | (vg   + 32);
+						bytes[p++] = (vg_r + 8) << 4 | (vg_b +  8);
+					}
+					else {
+						bytes[p++] = QOI_OP_RGB;
+						bytes[p++] = px.rgba.r;
+						bytes[p++] = px.rgba.g;
+						bytes[p++] = px.rgba.b;
+					}
+				}
+				else {
+					bytes[p++] = QOI_OP_RGBA;
+					bytes[p++] = px.rgba.r;
+					bytes[p++] = px.rgba.g;
+					bytes[p++] = px.rgba.b;
+					bytes[p++] = px.rgba.a;
+				}
+			}
+		}
+		px_prev = px;
+	}
+
+	for (i = 0; i < (int)sizeof(qoi_padding); i++) {
+		bytes[p++] = qoi_padding[i];
+	}
+
+	*out_len = p;
+	return bytes;
+}
+
+void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) {
+	const unsigned char *bytes;
+	unsigned int header_magic;
+	unsigned char *pixels;
+	qoi_rgba_t index[64];
+	qoi_rgba_t px;
+	int px_len, chunks_len, px_pos;
+	int p = 0, run = 0;
+
+	if (
+		data == NULL || desc == NULL ||
+		(channels != 0 && channels != 3 && channels != 4) ||
+		size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding)
+	) {
+		return NULL;
+	}
+
+	bytes = (const unsigned char *)data;
+
+	header_magic = qoi_read_32(bytes, &p);
+	desc->width = qoi_read_32(bytes, &p);
+	desc->height = qoi_read_32(bytes, &p);
+	desc->channels = bytes[p++];
+	desc->colorspace = bytes[p++];
+
+	if (
+		desc->width == 0 || desc->height == 0 ||
+		desc->channels < 3 || desc->channels > 4 ||
+		desc->colorspace > 1 ||
+		header_magic != QOI_MAGIC ||
+		desc->height >= QOI_PIXELS_MAX / desc->width
+	) {
+		return NULL;
+	}
+
+	if (channels == 0) {
+		channels = desc->channels;
+	}
+
+	px_len = desc->width * desc->height * channels;
+	pixels = (unsigned char *) QOI_MALLOC(px_len);
+	if (!pixels) {
+		return NULL;
+	}
+
+	QOI_ZEROARR(index);
+	px.rgba.r = 0;
+	px.rgba.g = 0;
+	px.rgba.b = 0;
+	px.rgba.a = 255;
+
+	chunks_len = size - (int)sizeof(qoi_padding);
+	for (px_pos = 0; px_pos < px_len; px_pos += channels) {
+		if (run > 0) {
+			run--;
+		}
+		else if (p < chunks_len) {
+			int b1 = bytes[p++];
+
+			if (b1 == QOI_OP_RGB) {
+				px.rgba.r = bytes[p++];
+				px.rgba.g = bytes[p++];
+				px.rgba.b = bytes[p++];
+			}
+			else if (b1 == QOI_OP_RGBA) {
+				px.rgba.r = bytes[p++];
+				px.rgba.g = bytes[p++];
+				px.rgba.b = bytes[p++];
+				px.rgba.a = bytes[p++];
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) {
+				px = index[b1];
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) {
+				px.rgba.r += ((b1 >> 4) & 0x03) - 2;
+				px.rgba.g += ((b1 >> 2) & 0x03) - 2;
+				px.rgba.b += ( b1       & 0x03) - 2;
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) {
+				int b2 = bytes[p++];
+				int vg = (b1 & 0x3f) - 32;
+				px.rgba.r += (uint8_t)(vg - 8 + ((b2 >> 4) & 0x0f));
+				px.rgba.g += (uint8_t)(vg);
+				px.rgba.b += (uint8_t)(vg - 8 +  (b2       & 0x0f));
+			}
+			else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) {
+				run = (b1 & 0x3f);
+			}
+
+			index[QOI_COLOR_HASH(px) % 64] = px;
+		}
+
+		pixels[px_pos + 0] = px.rgba.r;
+		pixels[px_pos + 1] = px.rgba.g;
+		pixels[px_pos + 2] = px.rgba.b;
+		
+		if (channels == 4) {
+			pixels[px_pos + 3] = px.rgba.a;
+		}
+	}
+
+	return pixels;
+}
+
+#ifndef QOI_NO_STDIO
+#include <stdio.h>
+
+int qoi_write(const char *filename, const void *data, const qoi_desc *desc) {
+#ifdef _MSC_VER
+	FILE* f = NULL;
+	fopen_s(&f, filename, "wb");
+#else
+	FILE *f = fopen(filename, "wb");
+#endif
+	int size, err;
+	void *encoded;
+
+	if (!f) {
+		return 0;
+	}
+
+	encoded = qoi_encode(data, desc, &size);
+	if (!encoded) {
+		fclose(f);
+		return 0;
+	}
+
+	fwrite(encoded, 1, size, f);
+	fflush(f);
+	err = ferror(f);
+	fclose(f);
+
+	QOI_FREE(encoded);
+	return err ? 0 : size;
+}
+
+void *qoi_read(const char *filename, qoi_desc *desc, int channels) {
+#ifdef _MSC_VER
+	FILE* f = NULL;
+	fopen_s(&f, filename, "rb");
+#else
+	FILE *f = fopen(filename, "rb");
+#endif
+	int size, bytes_read;
+	void *pixels, *data;
+
+	if (!f) {
+		return NULL;
+	}
+
+	fseek(f, 0, SEEK_END);
+	size = ftell(f);
+	if (size <= 0 || fseek(f, 0, SEEK_SET) != 0) {
+		fclose(f);
+		return NULL;
+	}
+
+	data = QOI_MALLOC(size);
+	if (!data) {
+		fclose(f);
+		return NULL;
+	}
+
+	bytes_read = (int)fread(data, 1, size, f);
+	fclose(f);
+	pixels = (bytes_read != size) ? NULL : qoi_decode(data, bytes_read, desc, channels);
+	QOI_FREE(data);
+	return pixels;
+}
+
+#endif /* QOI_NO_STDIO */
+#endif /* QOI_IMPLEMENTATION */
diff --git a/encoder/3rdparty/tinydds.h b/encoder/3rdparty/tinydds.h
new file mode 100644
index 0000000..41d9d7e
--- /dev/null
+++ b/encoder/3rdparty/tinydds.h
@@ -0,0 +1,2083 @@
+// MIT license see full LICENSE text at end of file
+#pragma once
+#ifndef TINY_DDS_TINYDDS_H
+#define TINY_DDS_TINYDDS_H
+
+#ifndef TINYDDS_HAVE_UINTXX_T
+#include <stdint.h>  // for uint32_t and int64_t
+#endif
+#ifndef TINYDDS_HAVE_BOOL
+#include <stdbool.h>  // for bool
+#endif
+#ifndef TINYDDS_HAVE_SIZE_T
+#include <stddef.h>    // for size_t
+#endif
+#ifndef TINYDDS_HAVE_MEMCPY
+#include <string.h>  // for memcpy
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TINYDDS_MAX_MIPMAPLEVELS 16
+
+typedef struct TinyDDS_Context *TinyDDS_ContextHandle;
+
+typedef void *(*TinyDDS_AllocFunc)(void *user, size_t size);
+typedef void (*TinyDDS_FreeFunc)(void *user, void *memory);
+typedef size_t (*TinyDDS_ReadFunc)(void *user, void *buffer, size_t byteCount);
+typedef bool (*TinyDDS_SeekFunc)(void *user, int64_t offset);
+typedef int64_t (*TinyDDS_TellFunc)(void *user);
+typedef void (*TinyDDS_ErrorFunc)(void *user, char const *msg);
+
+typedef struct TinyDDS_Callbacks {
+	TinyDDS_ErrorFunc errorFn;
+	TinyDDS_AllocFunc allocFn;
+	TinyDDS_FreeFunc freeFn;
+	TinyDDS_ReadFunc readFn;
+	TinyDDS_SeekFunc seekFn;
+	TinyDDS_TellFunc tellFn;
+} TinyDDS_Callbacks;
+
+TinyDDS_ContextHandle TinyDDS_CreateContext(TinyDDS_Callbacks const *callbacks, void *user);
+void TinyDDS_DestroyContext(TinyDDS_ContextHandle handle);
+
+// reset lets you reuse the context for another file (saves an alloc/free cycle)
+void TinyDDS_Reset(TinyDDS_ContextHandle handle);
+
+// call this to read the header file should already be at the start of the KTX data
+bool TinyDDS_ReadHeader(TinyDDS_ContextHandle handle);
+
+bool TinyDDS_Is1D(TinyDDS_ContextHandle handle);
+bool TinyDDS_Is2D(TinyDDS_ContextHandle handle);
+bool TinyDDS_Is3D(TinyDDS_ContextHandle handle);
+bool TinyDDS_IsCubemap(TinyDDS_ContextHandle handle);
+bool TinyDDS_IsArray(TinyDDS_ContextHandle handle);
+
+bool TinyDDS_Dimensions(TinyDDS_ContextHandle handle,
+												uint32_t *width,
+												uint32_t *height,
+												uint32_t *depth,
+												uint32_t *slices);
+uint32_t TinyDDS_Width(TinyDDS_ContextHandle handle);
+uint32_t TinyDDS_Height(TinyDDS_ContextHandle handle);
+uint32_t TinyDDS_Depth(TinyDDS_ContextHandle handle);
+uint32_t TinyDDS_ArraySlices(TinyDDS_ContextHandle handle);
+
+bool TinyDDS_NeedsGenerationOfMipmaps(TinyDDS_ContextHandle handle);
+bool TinyDDS_NeedsEndianCorrecting(TinyDDS_ContextHandle handle);
+
+uint32_t TinyDDS_NumberOfMipmaps(TinyDDS_ContextHandle handle);
+uint32_t TinyDDS_ImageSize(TinyDDS_ContextHandle handle, uint32_t mipmaplevel);
+
+// data return by ImageRawData is owned by the context. Don't free it!
+void const *TinyDDS_ImageRawData(TinyDDS_ContextHandle handle, uint32_t mipmaplevel);
+
+typedef void (*TinyDDS_WriteFunc)(void *user, void const *buffer, size_t byteCount);
+
+typedef struct TinyDDS_WriteCallbacks {
+	TinyDDS_ErrorFunc error;
+	TinyDDS_AllocFunc alloc;
+	TinyDDS_FreeFunc free;
+	TinyDDS_WriteFunc write;
+} TinyDDS_WriteCallbacks;
+
+#ifndef TINYIMAGEFORMAT_DXGIFORMAT
+#define TINYIMAGEFORMAT_DXGIFORMAT
+
+// early DDS was a direct copy of the Draw Draw surface bits, later on (Dx10) it moved to
+// DXGI_FORMAT we use a similar thing to DXGI_FORMAT second form but will synthesis
+// the old style when required when saving and vice versa when loading.
+typedef enum TinyImageFormat_DXGI_FORMAT {
+	TIF_DXGI_FORMAT_UNKNOWN = 0,
+	TIF_DXGI_FORMAT_R32G32B32A32_TYPELESS = 1,
+	TIF_DXGI_FORMAT_R32G32B32A32_FLOAT = 2,
+	TIF_DXGI_FORMAT_R32G32B32A32_UINT = 3,
+	TIF_DXGI_FORMAT_R32G32B32A32_SINT = 4,
+	TIF_DXGI_FORMAT_R32G32B32_TYPELESS = 5,
+	TIF_DXGI_FORMAT_R32G32B32_FLOAT = 6,
+	TIF_DXGI_FORMAT_R32G32B32_UINT = 7,
+	TIF_DXGI_FORMAT_R32G32B32_SINT = 8,
+	TIF_DXGI_FORMAT_R16G16B16A16_TYPELESS = 9,
+	TIF_DXGI_FORMAT_R16G16B16A16_FLOAT  = 10,
+	TIF_DXGI_FORMAT_R16G16B16A16_UNORM = 11,
+	TIF_DXGI_FORMAT_R16G16B16A16_UINT = 12,
+	TIF_DXGI_FORMAT_R16G16B16A16_SNORM = 13,
+	TIF_DXGI_FORMAT_R16G16B16A16_SINT = 14,
+	TIF_DXGI_FORMAT_R32G32_TYPELESS = 15,
+	TIF_DXGI_FORMAT_R32G32_FLOAT = 16,
+	TIF_DXGI_FORMAT_R32G32_UINT = 17,
+	TIF_DXGI_FORMAT_R32G32_SINT = 18,
+	TIF_DXGI_FORMAT_R32G8X24_TYPELESS = 19,
+	TIF_DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20,
+	TIF_DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21,
+	TIF_DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22,
+	TIF_DXGI_FORMAT_R10G10B10A2_TYPELESS = 23,
+	TIF_DXGI_FORMAT_R10G10B10A2_UNORM = 24,
+	TIF_DXGI_FORMAT_R10G10B10A2_UINT = 25,
+	TIF_DXGI_FORMAT_R11G11B10_FLOAT = 26,
+	TIF_DXGI_FORMAT_R8G8B8A8_TYPELESS = 27,
+	TIF_DXGI_FORMAT_R8G8B8A8_UNORM = 28,
+	TIF_DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29,
+	TIF_DXGI_FORMAT_R8G8B8A8_UINT = 30,
+	TIF_DXGI_FORMAT_R8G8B8A8_SNORM = 31,
+	TIF_DXGI_FORMAT_R8G8B8A8_SINT = 32,
+	TIF_DXGI_FORMAT_R16G16_TYPELESS = 33,
+	TIF_DXGI_FORMAT_R16G16_FLOAT = 34,
+	TIF_DXGI_FORMAT_R16G16_UNORM = 35,
+	TIF_DXGI_FORMAT_R16G16_UINT = 36,
+	TIF_DXGI_FORMAT_R16G16_SNORM = 37,
+	TIF_DXGI_FORMAT_R16G16_SINT = 38,
+	TIF_DXGI_FORMAT_R32_TYPELESS = 39,
+	TIF_DXGI_FORMAT_D32_FLOAT = 40,
+	TIF_DXGI_FORMAT_R32_FLOAT = 41,
+	TIF_DXGI_FORMAT_R32_UINT = 42,
+	TIF_DXGI_FORMAT_R32_SINT = 43,
+	TIF_DXGI_FORMAT_R24G8_TYPELESS = 44,
+	TIF_DXGI_FORMAT_D24_UNORM_S8_UINT = 45,
+	TIF_DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46,
+	TIF_DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47,
+	TIF_DXGI_FORMAT_R8G8_TYPELESS = 48,
+	TIF_DXGI_FORMAT_R8G8_UNORM = 49,
+	TIF_DXGI_FORMAT_R8G8_UINT = 50,
+	TIF_DXGI_FORMAT_R8G8_SNORM = 51,
+	TIF_DXGI_FORMAT_R8G8_SINT = 52,
+	TIF_DXGI_FORMAT_R16_TYPELESS = 53,
+	TIF_DXGI_FORMAT_R16_FLOAT = 54,
+	TIF_DXGI_FORMAT_D16_UNORM = 55,
+	TIF_DXGI_FORMAT_R16_UNORM = 56,
+	TIF_DXGI_FORMAT_R16_UINT = 57,
+	TIF_DXGI_FORMAT_R16_SNORM = 58,
+	TIF_DXGI_FORMAT_R16_SINT = 59,
+	TIF_DXGI_FORMAT_R8_TYPELESS = 60,
+	TIF_DXGI_FORMAT_R8_UNORM = 61,
+	TIF_DXGI_FORMAT_R8_UINT = 62,
+	TIF_DXGI_FORMAT_R8_SNORM = 63,
+	TIF_DXGI_FORMAT_R8_SINT = 64,
+	TIF_DXGI_FORMAT_A8_UNORM = 65,
+	TIF_DXGI_FORMAT_R1_UNORM = 66,
+	TIF_DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67,
+	TIF_DXGI_FORMAT_R8G8_B8G8_UNORM = 68,
+	TIF_DXGI_FORMAT_G8R8_G8B8_UNORM = 69,
+	TIF_DXGI_FORMAT_BC1_TYPELESS = 70,
+	TIF_DXGI_FORMAT_BC1_UNORM = 71,
+	TIF_DXGI_FORMAT_BC1_UNORM_SRGB = 72,
+	TIF_DXGI_FORMAT_BC2_TYPELESS = 73,
+	TIF_DXGI_FORMAT_BC2_UNORM = 74,
+	TIF_DXGI_FORMAT_BC2_UNORM_SRGB = 75,
+	TIF_DXGI_FORMAT_BC3_TYPELESS = 76,
+	TIF_DXGI_FORMAT_BC3_UNORM = 77,
+	TIF_DXGI_FORMAT_BC3_UNORM_SRGB = 78,
+	TIF_DXGI_FORMAT_BC4_TYPELESS = 79,
+	TIF_DXGI_FORMAT_BC4_UNORM = 80,
+	TIF_DXGI_FORMAT_BC4_SNORM = 81,
+	TIF_DXGI_FORMAT_BC5_TYPELESS = 82,
+	TIF_DXGI_FORMAT_BC5_UNORM = 83,
+	TIF_DXGI_FORMAT_BC5_SNORM = 84,
+	TIF_DXGI_FORMAT_B5G6R5_UNORM = 85,
+	TIF_DXGI_FORMAT_B5G5R5A1_UNORM = 86,
+	TIF_DXGI_FORMAT_B8G8R8A8_UNORM = 87,
+	TIF_DXGI_FORMAT_B8G8R8X8_UNORM = 88,
+	TIF_DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89,
+	TIF_DXGI_FORMAT_B8G8R8A8_TYPELESS = 90,
+	TIF_DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91,
+	TIF_DXGI_FORMAT_B8G8R8X8_TYPELESS = 92,
+	TIF_DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93,
+	TIF_DXGI_FORMAT_BC6H_TYPELESS = 94,
+	TIF_DXGI_FORMAT_BC6H_UF16 = 95,
+	TIF_DXGI_FORMAT_BC6H_SF16 = 96,
+	TIF_DXGI_FORMAT_BC7_TYPELESS = 97,
+	TIF_DXGI_FORMAT_BC7_UNORM = 98,
+	TIF_DXGI_FORMAT_BC7_UNORM_SRGB = 99,
+	TIF_DXGI_FORMAT_AYUV = 100,
+	TIF_DXGI_FORMAT_Y410 = 101,
+	TIF_DXGI_FORMAT_Y416 = 102,
+	TIF_DXGI_FORMAT_NV12 = 103,
+	TIF_DXGI_FORMAT_P010 = 104,
+	TIF_DXGI_FORMAT_P016 = 105,
+	TIF_DXGI_FORMAT_420_OPAQUE = 106,
+	TIF_DXGI_FORMAT_YUY2 = 107,
+	TIF_DXGI_FORMAT_Y210 = 108,
+	TIF_DXGI_FORMAT_Y216 = 109,
+	TIF_DXGI_FORMAT_NV11 = 110,
+	TIF_DXGI_FORMAT_AI44 = 111,
+	TIF_DXGI_FORMAT_IA44 = 112,
+	TIF_DXGI_FORMAT_P8 = 113,
+	TIF_DXGI_FORMAT_A8P8 = 114,
+	TIF_DXGI_FORMAT_B4G4R4A4_UNORM = 115,
+
+	// xbox 360 formats
+	TIF_DXGI_FORMAT_R10G10B10_7E3_A2_FLOAT = 116,
+	TIF_DXGI_FORMAT_R10G10B10_6E4_A2_FLOAT = 117,
+	TIF_DXGI_FORMAT_D16_UNORM_S8_UINT = 118,
+	TIF_DXGI_FORMAT_R16_UNORM_X8_TYPELESS = 119,
+	TIF_DXGI_FORMAT_X16_TYPELESS_G8_UINT = 120,
+
+	TIF_DXGI_FORMAT_P208 = 130,
+	TIF_DXGI_FORMAT_V208 = 131,
+	TIF_DXGI_FORMAT_V408 = 132,
+
+	// XBox One formats
+	TIF_DXGI_FORMAT_R10G10B10_SNORM_A2_UNORM = 189,
+	TIF_DXGI_FORMAT_R4G4_UNORM = 190,
+
+} TinyImageFormat_DXGI_FORMAT;
+#endif
+
+typedef enum TinyDDS_Format {
+	TDDS_UNDEFINED = TIF_DXGI_FORMAT_UNKNOWN,
+	TDDS_B5G6R5_UNORM = TIF_DXGI_FORMAT_B5G6R5_UNORM,
+	TDDS_B5G5R5A1_UNORM = TIF_DXGI_FORMAT_B5G5R5A1_UNORM,
+	TDDS_R8_UNORM = TIF_DXGI_FORMAT_R8_UNORM,
+	TDDS_R8_SNORM = TIF_DXGI_FORMAT_R8_SNORM,
+	TDDS_A8_UNORM = TIF_DXGI_FORMAT_A8_UNORM,
+	TDDS_R1_UNORM = TIF_DXGI_FORMAT_R1_UNORM,
+	TDDS_R8_UINT = TIF_DXGI_FORMAT_R8_UINT,
+	TDDS_R8_SINT = TIF_DXGI_FORMAT_R8_SINT,
+	TDDS_R8G8_UNORM = TIF_DXGI_FORMAT_R8G8_UNORM,
+	TDDS_R8G8_SNORM = TIF_DXGI_FORMAT_R8G8_SNORM,
+	TDDS_R8G8_UINT = TIF_DXGI_FORMAT_R8G8_UINT,
+	TDDS_R8G8_SINT = TIF_DXGI_FORMAT_R8G8_SINT,
+	TDDS_R8G8B8A8_UNORM = TIF_DXGI_FORMAT_R8G8B8A8_UNORM,
+	TDDS_R8G8B8A8_SNORM = TIF_DXGI_FORMAT_R8G8B8A8_SNORM,
+	TDDS_R8G8B8A8_UINT = TIF_DXGI_FORMAT_R8G8B8A8_UINT,
+	TDDS_R8G8B8A8_SINT = TIF_DXGI_FORMAT_R8G8B8A8_SINT,
+	TDDS_R8G8B8A8_SRGB = TIF_DXGI_FORMAT_R8G8B8A8_UNORM_SRGB,
+	TDDS_B8G8R8A8_UNORM = TIF_DXGI_FORMAT_B8G8R8A8_UNORM,
+	TDDS_B8G8R8A8_SRGB = TIF_DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,
+
+	TDDS_R9G9B9E5_UFLOAT = TIF_DXGI_FORMAT_R9G9B9E5_SHAREDEXP,
+	TDDS_R10G10B10A2_UNORM = TIF_DXGI_FORMAT_R10G10B10A2_UNORM,
+	TDDS_R10G10B10A2_UINT = TIF_DXGI_FORMAT_R10G10B10A2_UINT,
+	TDDS_R11G11B10_UFLOAT = TIF_DXGI_FORMAT_R11G11B10_FLOAT,
+
+	TDDS_R16_UNORM = TIF_DXGI_FORMAT_R16_UNORM,
+	TDDS_R16_SNORM = TIF_DXGI_FORMAT_R16_SNORM,
+	TDDS_R16_UINT = TIF_DXGI_FORMAT_R16_UINT,
+	TDDS_R16_SINT = TIF_DXGI_FORMAT_R16_SINT,
+	TDDS_R16_SFLOAT = TIF_DXGI_FORMAT_R16_FLOAT,
+
+	TDDS_R16G16_UNORM = TIF_DXGI_FORMAT_R16G16_UNORM,
+	TDDS_R16G16_SNORM = TIF_DXGI_FORMAT_R16G16_SNORM,
+	TDDS_R16G16_UINT = TIF_DXGI_FORMAT_R16G16_UINT,
+	TDDS_R16G16_SINT = TIF_DXGI_FORMAT_R16G16_SINT,
+	TDDS_R16G16_SFLOAT = TIF_DXGI_FORMAT_R16G16_FLOAT,
+
+	TDDS_R16G16B16A16_UNORM = TIF_DXGI_FORMAT_R16G16B16A16_UNORM,
+	TDDS_R16G16B16A16_SNORM = TIF_DXGI_FORMAT_R16G16B16A16_SNORM,
+	TDDS_R16G16B16A16_UINT = TIF_DXGI_FORMAT_R16G16B16A16_UINT,
+	TDDS_R16G16B16A16_SINT = TIF_DXGI_FORMAT_R16G16B16A16_SINT,
+	TDDS_R16G16B16A16_SFLOAT = TIF_DXGI_FORMAT_R16G16B16A16_FLOAT,
+
+	TDDS_R32_UINT = TIF_DXGI_FORMAT_R32_UINT,
+	TDDS_R32_SINT = TIF_DXGI_FORMAT_R32_SINT,
+	TDDS_R32_SFLOAT = TIF_DXGI_FORMAT_R32_FLOAT,
+
+	TDDS_R32G32_UINT = TIF_DXGI_FORMAT_R32G32_UINT,
+	TDDS_R32G32_SINT = TIF_DXGI_FORMAT_R32G32_SINT,
+	TDDS_R32G32_SFLOAT = TIF_DXGI_FORMAT_R32G32_FLOAT,
+
+	TDDS_R32G32B32_UINT = TIF_DXGI_FORMAT_R32G32B32_UINT,
+	TDDS_R32G32B32_SINT = TIF_DXGI_FORMAT_R32G32B32_SINT,
+	TDDS_R32G32B32_SFLOAT = TIF_DXGI_FORMAT_R32G32B32_FLOAT,
+
+	TDDS_R32G32B32A32_UINT = TIF_DXGI_FORMAT_R32G32B32A32_UINT,
+	TDDS_R32G32B32A32_SINT = TIF_DXGI_FORMAT_R32G32B32A32_SINT,
+	TDDS_R32G32B32A32_SFLOAT = TIF_DXGI_FORMAT_R32G32B32A32_FLOAT,
+
+	TDDS_BC1_RGBA_UNORM_BLOCK = TIF_DXGI_FORMAT_BC1_UNORM,
+	TDDS_BC1_RGBA_SRGB_BLOCK = TIF_DXGI_FORMAT_BC1_UNORM_SRGB,
+	TDDS_BC2_UNORM_BLOCK = TIF_DXGI_FORMAT_BC2_UNORM,
+	TDDS_BC2_SRGB_BLOCK = TIF_DXGI_FORMAT_BC2_UNORM_SRGB,
+	TDDS_BC3_UNORM_BLOCK = TIF_DXGI_FORMAT_BC3_UNORM,
+	TDDS_BC3_SRGB_BLOCK = TIF_DXGI_FORMAT_BC3_UNORM_SRGB,
+	TDDS_BC4_UNORM_BLOCK = TIF_DXGI_FORMAT_BC4_UNORM,
+	TDDS_BC4_SNORM_BLOCK = TIF_DXGI_FORMAT_BC4_SNORM,
+	TDDS_BC5_UNORM_BLOCK = TIF_DXGI_FORMAT_BC5_UNORM,
+	TDDS_BC5_SNORM_BLOCK = TIF_DXGI_FORMAT_BC5_SNORM,
+
+	TDDS_BC6H_UFLOAT_BLOCK = TIF_DXGI_FORMAT_BC6H_UF16,
+	TDDS_BC6H_SFLOAT_BLOCK = TIF_DXGI_FORMAT_BC6H_SF16,
+	TDDS_BC7_UNORM_BLOCK = TIF_DXGI_FORMAT_BC7_UNORM,
+	TDDS_BC7_SRGB_BLOCK = TIF_DXGI_FORMAT_BC7_UNORM_SRGB,
+
+	TDDS_AYUV = TIF_DXGI_FORMAT_AYUV,
+	TDDS_Y410 = TIF_DXGI_FORMAT_Y410,
+	TDDS_Y416 = TIF_DXGI_FORMAT_Y416,
+	TDDS_NV12 = TIF_DXGI_FORMAT_NV12,
+	TDDS_P010 = TIF_DXGI_FORMAT_P010,
+	TDDS_P016 = TIF_DXGI_FORMAT_P016,
+	TDDS_420_OPAQUE = TIF_DXGI_FORMAT_420_OPAQUE,
+	TDDS_YUY2 = TIF_DXGI_FORMAT_YUY2,
+	TDDS_Y210 = TIF_DXGI_FORMAT_Y210,
+	TDDS_Y216 = TIF_DXGI_FORMAT_Y216,
+	TDDS_NV11 = TIF_DXGI_FORMAT_NV11,
+	TDDS_AI44 = TIF_DXGI_FORMAT_AI44,
+	TDDS_IA44 = TIF_DXGI_FORMAT_IA44,
+	TDDS_P8 = TIF_DXGI_FORMAT_P8,
+	TDDS_A8P8 = TIF_DXGI_FORMAT_A8P8,
+	TDDS_B4G4R4A4_UNORM = TIF_DXGI_FORMAT_B4G4R4A4_UNORM,
+	TDDS_R10G10B10_7E3_A2_FLOAT = TIF_DXGI_FORMAT_R10G10B10_7E3_A2_FLOAT,
+	TDDS_R10G10B10_6E4_A2_FLOAT = TIF_DXGI_FORMAT_R10G10B10_6E4_A2_FLOAT,
+	TDDS_D16_UNORM_S8_UINT = TIF_DXGI_FORMAT_D16_UNORM_S8_UINT,
+	TDDS_R16_UNORM_X8_TYPELESS = TIF_DXGI_FORMAT_R16_UNORM_X8_TYPELESS,
+	TDDS_X16_TYPELESS_G8_UINT = TIF_DXGI_FORMAT_X16_TYPELESS_G8_UINT,
+	TDDS_P208 = TIF_DXGI_FORMAT_P208,
+	TDDS_V208 = TIF_DXGI_FORMAT_V208,
+	TDDS_V408 = TIF_DXGI_FORMAT_V408,
+	TDDS_R10G10B10_SNORM_A2_UNORM = TIF_DXGI_FORMAT_R10G10B10_SNORM_A2_UNORM,
+	TDDS_R4G4_UNORM = TIF_DXGI_FORMAT_R4G4_UNORM,
+
+	TDDS_SYNTHESISED_DXGIFORMATS = 0xFFFF,
+	TDDS_G4R4_UNORM = TDDS_SYNTHESISED_DXGIFORMATS,
+
+	TDDS_A4B4G4R4_UNORM,
+	TDDS_X4B4G4R4_UNORM,
+
+	TDDS_A4R4G4B4_UNORM,
+	TDDS_X4R4G4B4_UNORM,
+
+	TDDS_B4G4R4X4_UNORM,
+
+	TDDS_R4G4B4A4_UNORM,
+	TDDS_R4G4B4X4_UNORM,
+
+	TDDS_B5G5R5X1_UNORM,
+
+	TDDS_R5G5B5A1_UNORM,
+	TDDS_R5G5B5X1_UNORM,
+
+	TDDS_A1R5G5B5_UNORM,
+	TDDS_X1R5G5B5_UNORM,
+
+	TDDS_A1B5G5R5_UNORM,
+	TDDS_X1B5G5R5_UNORM,
+
+	TDDS_R5G6B5_UNORM,
+
+	TDDS_B2G3R3_UNORM,
+	TDDS_B2G3R3A8_UNORM,
+
+	TDDS_G8R8_UNORM,
+	TDDS_G8R8_SNORM,
+
+	TDDS_R8G8B8_UNORM,
+	TDDS_B8G8R8_UNORM,
+
+	TDDS_A8B8G8R8_SNORM,
+	TDDS_B8G8R8A8_SNORM,
+
+	TDDS_R8G8B8X8_UNORM,
+	TDDS_B8G8R8X8_UNORM,
+	TDDS_A8B8G8R8_UNORM,
+	TDDS_X8B8G8R8_UNORM,
+	TDDS_A8R8G8B8_UNORM,
+	TDDS_X8R8G8B8_UNORM,
+
+	TDDS_R10G10B10A2_SNORM,
+	TDDS_B10G10R10A2_UNORM,
+	TDDS_B10G10R10A2_SNORM,
+	TDDS_A2B10G10R10_UNORM,
+	TDDS_A2B10G10R10_SNORM,
+	TDDS_A2R10G10B10_UNORM,
+	TDDS_A2R10G10B10_SNORM,
+
+	TDDS_G16R16_UNORM,
+	TDDS_G16R16_SNORM,
+
+} TinyDDS_Format;
+
+// tiny_imageformat/format needs included before tinydds.h for this functionality
+#ifdef TINYIMAGEFORMAT_BASE_H_
+
+static TinyImageFormat TinyImageFormat_FromTinyDDSFormat(TinyDDS_Format fmt) {
+	switch (fmt) {
+	case TDDS_UNDEFINED: return TinyImageFormat_UNDEFINED;
+
+	case TDDS_R32G32B32A32_SFLOAT: return TinyImageFormat_R32G32B32A32_SFLOAT;
+	case TDDS_R32G32B32A32_UINT: return TinyImageFormat_R32G32B32A32_UINT;
+	case TDDS_R32G32B32A32_SINT: return TinyImageFormat_R32G32B32A32_SINT;
+	case TDDS_R32G32B32_SFLOAT: return TinyImageFormat_R32G32B32_SFLOAT;
+	case TDDS_R32G32B32_UINT: return TinyImageFormat_R32G32B32_UINT;
+	case TDDS_R32G32B32_SINT: return TinyImageFormat_R32G32B32_SINT;
+	case TDDS_R16G16B16A16_SFLOAT: return TinyImageFormat_R16G16B16A16_SFLOAT;
+	case TDDS_R16G16B16A16_UNORM: return TinyImageFormat_R16G16B16A16_UNORM;
+	case TDDS_R16G16B16A16_UINT: return TinyImageFormat_R16G16B16A16_UINT;
+	case TDDS_R16G16B16A16_SNORM: return TinyImageFormat_R16G16B16A16_SNORM;
+	case TDDS_R16G16B16A16_SINT: return TinyImageFormat_R16G16B16A16_SINT;
+	case TDDS_R32G32_SFLOAT: return TinyImageFormat_R32G32_SFLOAT;
+	case TDDS_R32G32_UINT: return TinyImageFormat_R32G32_UINT;
+	case TDDS_R32G32_SINT: return TinyImageFormat_R32G32_SINT;
+	case TDDS_R8G8B8A8_UNORM: return TinyImageFormat_R8G8B8A8_UNORM;
+	case TDDS_R8G8B8A8_SRGB: return TinyImageFormat_R8G8B8A8_SRGB;
+	case TDDS_R8G8B8A8_UINT: return TinyImageFormat_R8G8B8A8_UINT;
+	case TDDS_R8G8B8A8_SNORM: return TinyImageFormat_R8G8B8A8_SNORM;
+	case TDDS_R8G8B8A8_SINT: return TinyImageFormat_R8G8B8A8_SINT;
+	case TDDS_R16G16_SFLOAT: return TinyImageFormat_R16G16_SFLOAT;
+	case TDDS_R16G16_UNORM: return TinyImageFormat_R16G16_UNORM;
+	case TDDS_R16G16_UINT: return TinyImageFormat_R16G16_UINT;
+	case TDDS_R16G16_SNORM: return TinyImageFormat_R16G16_SNORM;
+	case TDDS_R16G16_SINT: return TinyImageFormat_R16G16_SINT;
+	case TDDS_R32_SFLOAT: return TinyImageFormat_R32_SFLOAT;
+	case TDDS_R32_UINT: return TinyImageFormat_R32_UINT;
+	case TDDS_R32_SINT: return TinyImageFormat_R32_SINT;
+
+	case TDDS_R8G8_UNORM: return TinyImageFormat_R8G8_UNORM;
+	case TDDS_R8G8_UINT: return TinyImageFormat_R8G8_UINT;
+	case TDDS_R8G8_SNORM: return TinyImageFormat_R8G8_SNORM;
+	case TDDS_R8G8_SINT: return TinyImageFormat_R8G8_SINT;
+	case TDDS_G8R8_UNORM: return TinyImageFormat_G8R8_UNORM;
+	case TDDS_G8R8_SNORM: return TinyImageFormat_G8R8_SNORM;
+
+	case TDDS_R16_SFLOAT: return TinyImageFormat_R16_SFLOAT;
+	case TDDS_R16_UNORM: return TinyImageFormat_R16_UNORM;
+	case TDDS_R16_UINT: return TinyImageFormat_R16_UINT;
+	case TDDS_R16_SNORM: return TinyImageFormat_R16_SNORM;
+	case TDDS_R16_SINT: return TinyImageFormat_R16_SINT;
+	case TDDS_R8_UNORM: return TinyImageFormat_R8_UNORM;
+	case TDDS_R8_UINT: return TinyImageFormat_R8_UINT;
+	case TDDS_R8_SNORM: return TinyImageFormat_R8_SNORM;
+	case TDDS_R8_SINT: return TinyImageFormat_R8_SINT;
+	case TDDS_A8_UNORM: return TinyImageFormat_A8_UNORM;
+	case TDDS_BC1_RGBA_UNORM_BLOCK: return TinyImageFormat_DXBC1_RGBA_UNORM;
+	case TDDS_BC1_RGBA_SRGB_BLOCK:			return TinyImageFormat_DXBC1_RGBA_SRGB;
+	case TDDS_BC2_UNORM_BLOCK: return TinyImageFormat_DXBC2_UNORM;
+	case TDDS_BC2_SRGB_BLOCK: return TinyImageFormat_DXBC2_SRGB;
+	case TDDS_BC3_UNORM_BLOCK: return TinyImageFormat_DXBC3_UNORM;
+	case TDDS_BC3_SRGB_BLOCK: return TinyImageFormat_DXBC3_SRGB;
+	case TDDS_BC4_UNORM_BLOCK: return TinyImageFormat_DXBC4_UNORM;
+	case TDDS_BC4_SNORM_BLOCK: return TinyImageFormat_DXBC4_SNORM;
+	case TDDS_BC5_UNORM_BLOCK: return TinyImageFormat_DXBC5_UNORM;
+	case TDDS_BC5_SNORM_BLOCK: return TinyImageFormat_DXBC5_SNORM;
+	case TDDS_BC6H_UFLOAT_BLOCK: return TinyImageFormat_DXBC6H_UFLOAT;
+	case TDDS_BC6H_SFLOAT_BLOCK: return TinyImageFormat_DXBC6H_SFLOAT;
+	case TDDS_BC7_UNORM_BLOCK: return TinyImageFormat_DXBC7_UNORM;
+	case TDDS_BC7_SRGB_BLOCK: return TinyImageFormat_DXBC7_SRGB;
+	case TDDS_B8G8R8A8_UNORM: return TinyImageFormat_B8G8R8A8_UNORM;
+	case TDDS_B8G8R8A8_SRGB: return TinyImageFormat_B8G8R8A8_SRGB;
+
+	case TDDS_B2G3R3A8_UNORM: return TinyImageFormat_B2G3R3A8_UNORM;
+	case TDDS_B2G3R3_UNORM: return TinyImageFormat_B2G3R3_UNORM;
+	case TDDS_R4G4_UNORM: return TinyImageFormat_R4G4_UNORM;
+
+	case TDDS_R8G8B8_UNORM: return TinyImageFormat_R8G8B8_UNORM;
+	case TDDS_B8G8R8_UNORM:	return TinyImageFormat_B8G8R8_UNORM;
+	case TDDS_B8G8R8A8_SNORM:					return TinyImageFormat_B8G8R8A8_SNORM;
+
+	case TDDS_R9G9B9E5_UFLOAT: return TinyImageFormat_E5B9G9R9_UFLOAT;
+	case TDDS_R11G11B10_UFLOAT: return TinyImageFormat_B10G11R11_UFLOAT;
+	case TDDS_G4R4_UNORM: return TinyImageFormat_G4R4_UNORM;
+
+	case TDDS_R5G6B5_UNORM: return TinyImageFormat_R5G6B5_UNORM;
+	case TDDS_B5G6R5_UNORM: return TinyImageFormat_B5G6R5_UNORM;
+
+	case TDDS_B5G5R5A1_UNORM: return TinyImageFormat_B5G5R5A1_UNORM;
+	case TDDS_B5G5R5X1_UNORM: return TinyImageFormat_B5G5R5X1_UNORM;
+
+	case TDDS_R5G5B5A1_UNORM: return TinyImageFormat_R5G5B5A1_UNORM;
+	case TDDS_R5G5B5X1_UNORM: return TinyImageFormat_R5G5B5X1_UNORM;
+
+	case TDDS_A1R5G5B5_UNORM: return TinyImageFormat_A1R5G5B5_UNORM;
+	case TDDS_X1R5G5B5_UNORM: return TinyImageFormat_X1R5G5B5_UNORM;
+
+	case TDDS_X1B5G5R5_UNORM: return TinyImageFormat_X1B5G5R5_UNORM;
+	case TDDS_A1B5G5R5_UNORM: return TinyImageFormat_A1B5G5R5_UNORM;
+
+	case TDDS_X4B4G4R4_UNORM: return TinyImageFormat_X4B4G4R4_UNORM;
+	case TDDS_X4R4G4B4_UNORM: return TinyImageFormat_X4R4G4B4_UNORM;
+	case TDDS_A4R4G4B4_UNORM:	return TinyImageFormat_A4R4G4B4_UNORM;
+	case TDDS_B4G4R4A4_UNORM: return TinyImageFormat_B4G4R4A4_UNORM;
+	case TDDS_A4B4G4R4_UNORM: return TinyImageFormat_A4B4G4R4_UNORM;
+	case TDDS_B4G4R4X4_UNORM: return TinyImageFormat_B4G4R4X4_UNORM;
+	case TDDS_R4G4B4A4_UNORM: return TinyImageFormat_R4G4B4A4_UNORM;
+	case TDDS_R4G4B4X4_UNORM:	return TinyImageFormat_R4G4B4X4_UNORM;
+
+	case TDDS_R8G8B8X8_UNORM: return TinyImageFormat_R8G8B8X8_UNORM;
+
+	// DDS A2R10B10G10 support is basically broken historically so expect channels to need swapping
+	case TDDS_A2B10G10R10_UNORM:	return TinyImageFormat_A2B10G10R10_UNORM;
+	case TDDS_A2B10G10R10_SNORM:	return TinyImageFormat_A2B10G10R10_SNORM;
+	case TDDS_A2R10G10B10_UNORM:	return TinyImageFormat_A2R10G10B10_UNORM;
+	case TDDS_A2R10G10B10_SNORM:	return TinyImageFormat_A2R10G10B10_SNORM;
+	case TDDS_B10G10R10A2_UNORM: 	return TinyImageFormat_R10G10B10A2_UNORM;
+	case TDDS_B10G10R10A2_SNORM: 	return TinyImageFormat_R10G10B10A2_SNORM;
+	case TDDS_R10G10B10A2_UNORM: 	return TinyImageFormat_B10G10R10A2_UNORM;
+	case TDDS_R10G10B10A2_SNORM: 	return TinyImageFormat_B10G10R10A2_SNORM;
+	case TDDS_R10G10B10A2_UINT: 	return TinyImageFormat_B10G10R10A2_UINT;
+
+	case TDDS_B8G8R8X8_UNORM: return TinyImageFormat_B8G8R8X8_UNORM;
+
+	case TDDS_G16R16_UNORM: return TinyImageFormat_G16R16_UNORM;
+	case TDDS_G16R16_SNORM: return TinyImageFormat_G16R16_SNORM;
+	case TDDS_X8B8G8R8_UNORM: return TinyImageFormat_R8G8B8X8_UNORM;
+	case TDDS_X8R8G8B8_UNORM: return TinyImageFormat_B8G8R8X8_UNORM;
+	case TDDS_A8B8G8R8_UNORM: return TinyImageFormat_R8G8B8A8_UNORM;
+	case TDDS_A8R8G8B8_UNORM: return TinyImageFormat_B8G8R8A8_UNORM;
+	case TDDS_A8B8G8R8_SNORM: return TinyImageFormat_R8G8B8X8_UNORM;
+	case TDDS_P8: return TinyImageFormat_CLUT_P8;
+	case TDDS_A8P8: return TinyImageFormat_CLUT_P8A8;
+	case TDDS_R1_UNORM: return TinyImageFormat_R1_UNORM;
+
+	case TDDS_AYUV:break;
+	case TDDS_Y410:break;
+	case TDDS_Y416:break;
+	case TDDS_NV12:break;
+	case TDDS_P010:break;
+	case TDDS_P016:break;
+	case TDDS_420_OPAQUE:break;
+	case TDDS_YUY2:break;
+	case TDDS_Y210:break;
+	case TDDS_Y216:break;
+	case TDDS_NV11:break;
+	case TDDS_AI44:break;
+	case TDDS_IA44:break;
+	case TDDS_R10G10B10_7E3_A2_FLOAT:break;
+	case TDDS_R10G10B10_6E4_A2_FLOAT:break;
+	case TDDS_D16_UNORM_S8_UINT:break;
+	case TDDS_R16_UNORM_X8_TYPELESS:break;
+	case TDDS_X16_TYPELESS_G8_UINT:break;
+	case TDDS_P208:break;
+	case TDDS_V208:break;
+	case TDDS_V408:break;
+	case TDDS_R10G10B10_SNORM_A2_UNORM:break;
+	}
+
+	return TinyImageFormat_UNDEFINED;
+}
+
+static TinyDDS_Format TinyImageFormat_ToTinyDDSFormat(TinyImageFormat fmt) {
+	switch (fmt) {
+	case TinyImageFormat_R4G4_UNORM: return TDDS_R4G4_UNORM;
+	case TinyImageFormat_G4R4_UNORM: return TDDS_G4R4_UNORM;
+
+	case TinyImageFormat_A4R4G4B4_UNORM: return TDDS_A4R4G4B4_UNORM;
+	case TinyImageFormat_B4G4R4A4_UNORM: return TDDS_B4G4R4A4_UNORM;
+	case TinyImageFormat_A4B4G4R4_UNORM: return TDDS_A4B4G4R4_UNORM;
+	case TinyImageFormat_X4R4G4B4_UNORM: return TDDS_X4R4G4B4_UNORM;
+	case TinyImageFormat_X4B4G4R4_UNORM: return TDDS_X4B4G4R4_UNORM;
+	case TinyImageFormat_R4G4B4A4_UNORM: return TDDS_R4G4B4A4_UNORM;
+	case TinyImageFormat_R4G4B4X4_UNORM: return TDDS_R4G4B4X4_UNORM;
+
+	case TinyImageFormat_A1B5G5R5_UNORM: return TDDS_A1B5G5R5_UNORM;
+	case TinyImageFormat_X1B5G5R5_UNORM: return TDDS_X1B5G5R5_UNORM;
+
+	case TinyImageFormat_A1R5G5B5_UNORM: return TDDS_A1R5G5B5_UNORM;
+	case TinyImageFormat_X1R5G5B5_UNORM: return TDDS_X1R5G5B5_UNORM;
+
+	case TinyImageFormat_B5G5R5A1_UNORM: return TDDS_B5G5R5A1_UNORM;
+	case TinyImageFormat_B5G5R5X1_UNORM: return TDDS_B5G5R5X1_UNORM;
+
+	case TinyImageFormat_R5G5B5A1_UNORM: return TDDS_R5G5B5A1_UNORM;
+	case TinyImageFormat_R5G5B5X1_UNORM: return TDDS_R5G5B5X1_UNORM;
+
+	case TinyImageFormat_R5G6B5_UNORM: return TDDS_R5G6B5_UNORM;
+	case TinyImageFormat_B5G6R5_UNORM: return TDDS_B5G6R5_UNORM;
+
+	case TinyImageFormat_A2B10G10R10_UNORM:	return TDDS_A2B10G10R10_UNORM;
+	case TinyImageFormat_A2B10G10R10_SNORM:	return TDDS_A2B10G10R10_SNORM;
+	case TinyImageFormat_A2R10G10B10_UNORM:	return TDDS_A2R10G10B10_UNORM;
+	case TinyImageFormat_A2R10G10B10_SNORM:	return TDDS_A2R10G10B10_SNORM;
+	case TinyImageFormat_R10G10B10A2_UNORM: return TDDS_B10G10R10A2_UNORM;
+	case TinyImageFormat_R10G10B10A2_SNORM: return TDDS_B10G10R10A2_SNORM;
+	case TinyImageFormat_B10G10R10A2_UNORM: return TDDS_R10G10B10A2_UNORM;
+	case TinyImageFormat_B10G10R10A2_SNORM: return TDDS_R10G10B10A2_SNORM;
+	case TinyImageFormat_B10G10R10A2_UINT: 	return TDDS_R10G10B10A2_UINT;
+
+	case TinyImageFormat_E5B9G9R9_UFLOAT: return TDDS_R9G9B9E5_UFLOAT;
+	case TinyImageFormat_B10G11R11_UFLOAT: return TDDS_R11G11B10_UFLOAT;
+
+	case TinyImageFormat_R8_UNORM: 				return TDDS_R8_UNORM;
+	case TinyImageFormat_R8_SNORM: 				return TDDS_R8_SNORM;
+	case TinyImageFormat_R8_UINT: 				return TDDS_R8_UINT;
+	case TinyImageFormat_R8_SINT: 				return TDDS_R8_SINT;
+	case TinyImageFormat_A8_UNORM: 				return TDDS_A8_UNORM;
+	case TinyImageFormat_B2G3R3_UNORM: 		return TDDS_B2G3R3_UNORM;
+
+	case TinyImageFormat_B2G3R3A8_UNORM: 	return TDDS_B2G3R3A8_UNORM;
+	case TinyImageFormat_R8G8_UNORM: 			return TDDS_R8G8_UNORM;
+	case TinyImageFormat_R8G8_SNORM: 			return TDDS_R8G8_SNORM;
+	case TinyImageFormat_R8G8_UINT: 			return TDDS_R8G8_UINT;
+	case TinyImageFormat_R8G8_SINT: 			return TDDS_R8G8_SINT;
+	case TinyImageFormat_G8R8_UNORM:			return TDDS_G8R8_UNORM;
+	case TinyImageFormat_G8R8_SNORM:			return TDDS_G8R8_SNORM;
+
+	case TinyImageFormat_R8G8B8_UNORM: 		return TDDS_R8G8B8_UNORM;
+	case TinyImageFormat_B8G8R8_UNORM:		return TDDS_B8G8R8_UNORM;
+
+	case TinyImageFormat_R8G8B8A8_UNORM: 	return TDDS_R8G8B8A8_UNORM;
+	case TinyImageFormat_R8G8B8A8_SNORM: 	return TDDS_R8G8B8A8_SNORM;
+	case TinyImageFormat_R8G8B8A8_UINT: 	return TDDS_R8G8B8A8_UINT;
+	case TinyImageFormat_R8G8B8A8_SINT: 	return TDDS_R8G8B8A8_SINT;
+	case TinyImageFormat_R8G8B8A8_SRGB: 	return TDDS_R8G8B8A8_SRGB;
+	case TinyImageFormat_B8G8R8A8_UNORM: 	return TDDS_B8G8R8A8_UNORM;
+	case TinyImageFormat_B8G8R8A8_SRGB:		return TDDS_B8G8R8A8_SRGB;
+
+	case TinyImageFormat_R16_UNORM:				return TDDS_R16_UNORM;
+	case TinyImageFormat_R16_SNORM:				return TDDS_R16_SNORM;
+	case TinyImageFormat_R16_UINT:				return TDDS_R16_UINT;
+	case TinyImageFormat_R16_SINT:				return TDDS_R16_SINT;
+	case TinyImageFormat_R16_SFLOAT:			return TDDS_R16_SFLOAT;
+
+	case TinyImageFormat_R16G16_UNORM:		return TDDS_R16G16_UNORM;
+	case TinyImageFormat_R16G16_SNORM:		return TDDS_R16G16_SNORM;
+	case TinyImageFormat_R16G16_UINT:			return TDDS_R16G16_UINT;
+	case TinyImageFormat_R16G16_SINT:			return TDDS_R16G16_SINT;
+	case TinyImageFormat_R16G16_SFLOAT:		return TDDS_R16G16_SFLOAT;
+
+	case TinyImageFormat_G16R16_UNORM:					return TDDS_G16R16_UNORM;
+	case TinyImageFormat_G16R16_SNORM:					return TDDS_G16R16_SNORM;
+
+	case TinyImageFormat_R16G16B16A16_UNORM:	return TDDS_R16G16B16A16_UNORM;
+	case TinyImageFormat_R16G16B16A16_SNORM:	return TDDS_R16G16B16A16_SNORM;
+	case TinyImageFormat_R16G16B16A16_UINT:		return TDDS_R16G16B16A16_UINT;
+	case TinyImageFormat_R16G16B16A16_SINT:		return TDDS_R16G16B16A16_SINT;
+	case TinyImageFormat_R16G16B16A16_SFLOAT:	return TDDS_R16G16B16A16_SFLOAT;
+
+	case TinyImageFormat_R32_UINT:				return TDDS_R32_UINT;
+	case TinyImageFormat_R32_SINT:				return TDDS_R32_SINT;
+	case TinyImageFormat_R32_SFLOAT:			return TDDS_R32_SFLOAT;
+
+	case TinyImageFormat_R32G32_UINT:			return TDDS_R32G32_UINT;
+	case TinyImageFormat_R32G32_SINT:			return TDDS_R32G32_SINT;
+	case TinyImageFormat_R32G32_SFLOAT:		return TDDS_R32G32_SFLOAT;
+
+	case TinyImageFormat_R32G32B32_UINT:	return TDDS_R32G32B32_UINT;
+	case TinyImageFormat_R32G32B32_SINT:	return TDDS_R32G32B32_SINT;
+	case TinyImageFormat_R32G32B32_SFLOAT:return TDDS_R32G32B32_SFLOAT;
+
+	case TinyImageFormat_R32G32B32A32_UINT:		return TDDS_R32G32B32A32_UINT;
+	case TinyImageFormat_R32G32B32A32_SINT:		return TDDS_R32G32B32A32_SINT;
+	case TinyImageFormat_R32G32B32A32_SFLOAT:	return TDDS_R32G32B32A32_SFLOAT;
+
+	case TinyImageFormat_D16_UNORM:				return TDDS_R16_UNORM;
+	case TinyImageFormat_D32_SFLOAT:			return TDDS_R32_SFLOAT;
+	case TinyImageFormat_S8_UINT:					return TDDS_R8_UINT;
+	case TinyImageFormat_DXBC1_RGB_UNORM: 	return TDDS_BC1_RGBA_UNORM_BLOCK;
+	case TinyImageFormat_DXBC1_RGB_SRGB:		return TDDS_BC1_RGBA_SRGB_BLOCK;
+	case TinyImageFormat_DXBC1_RGBA_UNORM:	return TDDS_BC1_RGBA_UNORM_BLOCK;
+	case TinyImageFormat_DXBC1_RGBA_SRGB:		return TDDS_BC1_RGBA_SRGB_BLOCK;
+	case TinyImageFormat_DXBC2_UNORM:				return TDDS_BC2_UNORM_BLOCK;
+	case TinyImageFormat_DXBC2_SRGB:				return TDDS_BC2_SRGB_BLOCK;
+	case TinyImageFormat_DXBC3_UNORM:				return TDDS_BC3_UNORM_BLOCK;
+	case TinyImageFormat_DXBC3_SRGB:				return TDDS_BC3_SRGB_BLOCK;
+	case TinyImageFormat_DXBC4_UNORM:				return TDDS_BC4_UNORM_BLOCK;
+	case TinyImageFormat_DXBC4_SNORM:				return TDDS_BC4_SNORM_BLOCK;
+	case TinyImageFormat_DXBC5_UNORM:				return TDDS_BC5_UNORM_BLOCK;
+	case TinyImageFormat_DXBC5_SNORM:				return TDDS_BC5_SNORM_BLOCK;
+	case TinyImageFormat_DXBC6H_UFLOAT:			return TDDS_BC6H_UFLOAT_BLOCK;
+	case TinyImageFormat_DXBC6H_SFLOAT:			return TDDS_BC6H_SFLOAT_BLOCK;
+	case TinyImageFormat_DXBC7_UNORM:				return TDDS_BC7_UNORM_BLOCK;
+	case TinyImageFormat_DXBC7_SRGB:				return TDDS_BC7_SRGB_BLOCK;
+
+	case TinyImageFormat_CLUT_P8: return TDDS_P8;
+	case TinyImageFormat_CLUT_P8A8: return TDDS_A8P8;
+	case TinyImageFormat_R1_UNORM: return TDDS_R1_UNORM;
+
+		// unsupported
+	// TODO Some of these can be via Dx10/4CC codes I think
+	default: return TDDS_UNDEFINED;
+	}
+
+	return TDDS_UNDEFINED;
+}
+#endif
+
+TinyDDS_Format TinyDDS_GetFormat(TinyDDS_ContextHandle handle);
+
+bool TinyDDS_WriteImage(TinyDDS_WriteCallbacks const *callbacks,
+												void *user,
+												uint32_t width,
+												uint32_t height,
+												uint32_t depth,
+												uint32_t slices,
+												uint32_t mipmaplevels,
+												TinyDDS_Format format,
+												bool cubemap,
+												bool preferDx10Format,
+												uint32_t const *mipmapsizes,
+												void const **mipmaps);
+
+#ifdef TINYDDS_IMPLEMENTATION
+
+#define TINYDDS_DDSD_CAPS 0x00000001
+#define TINYDDS_DDSD_HEIGHT 0x00000002
+#define TINYDDS_DDSD_WIDTH 0x00000004
+#define TINYDDS_DDSD_PITCH 0x00000008
+#define TINYDDS_DDSD_PIXELFORMAT 0x00001000
+#define TINYDDS_DDSD_MIPMAPCOUNT 0x00020000
+#define TINYDDS_DDSD_LINEARSIZE 0x00080000
+#define TINYDDS_DDSD_DEPTH 0x00800000
+#define TINYDDS_DDSCAPS_COMPLEX 0x00000008
+#define TINYDDS_DDSCAPS_TEXTURE 0x00001000
+#define TINYDDS_DDSCAPS_MIPMAP 0x00400000
+#define TINYDDS_DDSCAPS2_CUBEMAP 0x00000200
+#define TINYDDS_DDSCAPS2_VOLUME 0x00200000
+#define TINYDDS_DDSCAPS2_CUBEMAP_ALL 0x0000FC000
+#define TINYDDS_D3D10_RESOURCE_MISC_TEXTURECUBE 0x4
+#define TINYDDS_D3D10_RESOURCE_DIMENSION_BUFFER 1
+#define TINYDDS_D3D10_RESOURCE_DIMENSION_TEXTURE1D 2
+#define TINYDDS_D3D10_RESOURCE_DIMENSION_TEXTURE2D 3
+#define TINYDDS_D3D10_RESOURCE_DIMENSION_TEXTURE3D 4
+#define TINYDDS_DDPF_ALPHAPIXELS                        0x00000001l
+#define TINYDDS_DDPF_ALPHA                              0x00000002l
+#define TINYDDS_DDPF_FOURCC                             0x00000004l
+#define TINYDDS_DDPF_PALETTEINDEXED4                    0x00000008l
+#define TINYDDS_DDPF_PALETTEINDEXEDTO8                  0x00000010l
+#define TINYDDS_DDPF_PALETTEINDEXED8                    0x00000020l
+#define TINYDDS_DDPF_RGB                                0x00000040l
+#define TINYDDS_DDPF_LUMINANCE                          0x00020000l
+#define TINYDDS_DDPF_BUMPLUMINANCE                      0x00040000l
+#define TINYDDS_DDPF_BUMPDUDV                           0x00080000l
+
+// some of these get stuck in unofficial DDS v9 FourCC code
+typedef enum TINYDDS_D3DFORMAT {
+	TINYDDS_D3DFMT_UNKNOWN              =  0,
+	TINYDDS_D3DFMT_R8G8B8               = 20,
+	TINYDDS_D3DFMT_A8R8G8B8             = 21,
+	TINYDDS_D3DFMT_X8R8G8B8             = 22,
+	TINYDDS_D3DFMT_R5G6B5               = 23,
+	TINYDDS_D3DFMT_X1R5G5B5             = 24,
+	TINYDDS_D3DFMT_A1R5G5B5             = 25,
+	TINYDDS_D3DFMT_A4R4G4B4             = 26,
+	TINYDDS_D3DFMT_R3G3B2               = 27,
+	TINYDDS_D3DFMT_A8                   = 28,
+	TINYDDS_D3DFMT_A8R3G3B2             = 29,
+	TINYDDS_D3DFMT_X4R4G4B4             = 30,
+	TINYDDS_D3DFMT_A2B10G10R10          = 31,
+	TINYDDS_D3DFMT_A8B8G8R8             = 32,
+	TINYDDS_D3DFMT_X8B8G8R8             = 33,
+	TINYDDS_D3DFMT_G16R16               = 34,
+	TINYDDS_D3DFMT_A2R10G10B10          = 35,
+	TINYDDS_D3DFMT_A16B16G16R16         = 36,
+	TINYDDS_D3DFMT_A8P8                 = 40,
+	TINYDDS_D3DFMT_P8                   = 41,
+	TINYDDS_D3DFMT_L8                   = 50,
+	TINYDDS_D3DFMT_A8L8                 = 51,
+	TINYDDS_D3DFMT_A4L4                 = 52,
+	TINYDDS_D3DFMT_V8U8                 = 60,
+	TINYDDS_D3DFMT_L6V5U5               = 61,
+	TINYDDS_D3DFMT_X8L8V8U8             = 62,
+	TINYDDS_D3DFMT_Q8W8V8U8             = 63,
+	TINYDDS_D3DFMT_V16U16               = 64,
+	TINYDDS_D3DFMT_A2W10V10U10          = 67,
+	TINYDDS_D3DFMT_L16                  = 81,
+	TINYDDS_D3DFMT_Q16W16V16U16         = 110,
+	TINYDDS_D3DFMT_R16F                 = 111,
+	TINYDDS_D3DFMT_G16R16F              = 112,
+	TINYDDS_D3DFMT_A16B16G16R16F        = 113,
+	TINYDDS_D3DFMT_R32F                 = 114,
+	TINYDDS_D3DFMT_G32R32F              = 115,
+	TINYDDS_D3DFMT_A32B32G32R32F        = 116,
+	TINYDDS_D3DFMT_CxV8U8               = 117,
+	TINYDDS_D3DFMT_A1                   = 118,
+	TINYDDS_D3DFMT_A2B10G10R10_XR_BIAS  = 119,
+} TINYDDS_D3DFORMAT;
+
+typedef struct TinyDDS_Header {
+	uint32_t magic;
+	uint32_t size;
+	uint32_t flags;
+	uint32_t height;
+	uint32_t width;
+	uint32_t pitchOrLinearSize;
+	uint32_t depth;
+	uint32_t mipMapCount;
+	uint32_t reserved0[11];
+
+	uint32_t formatSize;
+	uint32_t formatFlags;
+	uint32_t formatFourCC;
+	uint32_t formatRGBBitCount;
+	uint32_t formatRBitMask;
+	uint32_t formatGBitMask;
+	uint32_t formatBBitMask;
+	uint32_t formatABitMask;
+
+	uint32_t caps1;
+	uint32_t caps2;
+	uint32_t caps3; // not used?
+	uint32_t caps4; // not used?
+
+	uint32_t reserved1;
+} TinyDDS_Header;
+
+typedef struct TinyDDS_HeaderDX10 {
+	uint32_t DXGIFormat;
+	uint32_t resourceDimension;
+	uint32_t miscFlag;
+	uint32_t arraySize;
+	uint32_t reserved;
+} TinyDDS_HeaderDX10;
+
+typedef struct TinyDDS_Context {
+	TinyDDS_Callbacks callbacks;
+	void *user;
+	uint64_t headerPos;
+	uint64_t firstImagePos;
+
+	TinyDDS_Header header;
+	TinyDDS_HeaderDX10 headerDx10;
+	TinyDDS_Format format;
+
+	bool headerValid;
+	uint8_t const *mipmaps[TINYDDS_MAX_MIPMAPLEVELS];
+	uint32_t const *clut;
+
+} TinyDDS_Context;
+
+#define TINYDDS_MAKE_RIFFCODE(a, b, c, d) (a | (b << 8) | (c << 16) | (d << 24))
+
+static uint32_t TinyDDS_fileIdentifier = TINYDDS_MAKE_RIFFCODE('D', 'D', 'S', ' ');
+
+static void TinyDDS_NullErrorFunc(void *user, char const *msg) { BASISU_NOTE_UNUSED(user); BASISU_NOTE_UNUSED(msg); }
+
+TinyDDS_ContextHandle TinyDDS_CreateContext(TinyDDS_Callbacks const *callbacks, void *user) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) callbacks->allocFn(user, sizeof(TinyDDS_Context));
+	if (ctx == NULL)
+		return NULL;
+
+	memset(ctx, 0, sizeof(TinyDDS_Context));
+	memcpy(&ctx->callbacks, callbacks, sizeof(TinyDDS_Callbacks));
+	ctx->user = user;
+	if (ctx->callbacks.errorFn == NULL) {
+		ctx->callbacks.errorFn = &TinyDDS_NullErrorFunc;
+	}
+
+	if (ctx->callbacks.readFn == NULL) {
+		ctx->callbacks.errorFn(user, "TinyDDS must have read callback");
+		return NULL;
+	}
+	if (ctx->callbacks.allocFn == NULL) {
+		ctx->callbacks.errorFn(user, "TinyDDS must have alloc callback");
+		return NULL;
+	}
+	if (ctx->callbacks.freeFn == NULL) {
+		ctx->callbacks.errorFn(user, "TinyDDS must have free callback");
+		return NULL;
+	}
+	if (ctx->callbacks.seekFn == NULL) {
+		ctx->callbacks.errorFn(user, "TinyDDS must have seek callback");
+		return NULL;
+	}
+	if (ctx->callbacks.tellFn == NULL) {
+		ctx->callbacks.errorFn(user, "TinyDDS must have tell callback");
+		return NULL;
+	}
+
+	TinyDDS_Reset(ctx);
+
+	return ctx;
+}
+
+void TinyDDS_DestroyContext(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return;
+	TinyDDS_Reset(handle);
+
+	ctx->callbacks.freeFn(ctx->user, ctx);
+}
+
+void TinyDDS_Reset(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return;
+
+	// backup user provided callbacks and data
+	TinyDDS_Callbacks callbacks;
+	memcpy(&callbacks, &ctx->callbacks, sizeof(TinyDDS_Callbacks));
+	void *user = ctx->user;
+
+	for (int i = 0; i < TINYDDS_MAX_MIPMAPLEVELS; ++i) {
+		if (ctx->mipmaps[i] != NULL) {
+			callbacks.freeFn(user, (void *) ctx->mipmaps[i]);
+		}
+	}
+
+	if(ctx->clut) {
+		callbacks.freeFn(user, (void *) ctx->clut);
+		ctx->clut = NULL;
+	}
+
+	// reset to default state
+	memset(ctx, 0, sizeof(TinyDDS_Context));
+	memcpy(&ctx->callbacks, &callbacks, sizeof(TinyDDS_Callbacks));
+	ctx->user = user;
+
+}
+
+static bool TinyDDS_IsCLUT(TinyDDS_Format fmt) {
+	switch (fmt) {
+	case TDDS_P8:
+	case TDDS_A8P8:
+		return true;
+	default: return false;
+	}
+}
+
+static bool TinyDDS_IsCompressed(TinyDDS_Format fmt) {
+	switch (fmt) {
+	case TDDS_BC1_RGBA_UNORM_BLOCK:
+	case TDDS_BC1_RGBA_SRGB_BLOCK:
+	case TDDS_BC2_UNORM_BLOCK:
+	case TDDS_BC2_SRGB_BLOCK:
+	case TDDS_BC3_UNORM_BLOCK:
+	case TDDS_BC3_SRGB_BLOCK:
+	case TDDS_BC4_UNORM_BLOCK:
+	case TDDS_BC4_SNORM_BLOCK:
+	case TDDS_BC5_UNORM_BLOCK:
+	case TDDS_BC5_SNORM_BLOCK:
+	case TDDS_BC6H_UFLOAT_BLOCK:
+	case TDDS_BC6H_SFLOAT_BLOCK:
+	case TDDS_BC7_UNORM_BLOCK:
+	case TDDS_BC7_SRGB_BLOCK: return true;
+	default: return false;
+	}
+}
+
+// the size is per pixel (except R1) for uncompressed and per block of 16 pixels for compressed
+static uint32_t TinyDDS_FormatSize(TinyDDS_Format fmt) {
+	switch(fmt) {
+	// 8 pixels at 1 bits each
+	case TDDS_R1_UNORM:
+		return 1;
+		// 2 * 4 bits
+	case TDDS_R4G4_UNORM:
+	case TDDS_G4R4_UNORM:
+		// 1 * 8 bits
+	case TDDS_P8:;
+	case TDDS_R8_UNORM:
+	case TDDS_R8_SNORM:
+	case TDDS_R8_UINT:
+	case TDDS_R8_SINT:
+	case TDDS_A8_UNORM:
+	// 2 + 2 * 3 bits
+	case TDDS_B2G3R3_UNORM:
+		return 1;
+
+		// 2 + 2 * 3 +8 bits
+	case TDDS_B2G3R3A8_UNORM:
+	// 4 * 4 bits
+	case TDDS_B4G4R4A4_UNORM:
+	case TDDS_A4B4G4R4_UNORM:
+	case TDDS_X4B4G4R4_UNORM:
+	case TDDS_A4R4G4B4_UNORM:
+	case TDDS_X4R4G4B4_UNORM:
+	case TDDS_B4G4R4X4_UNORM:
+	case TDDS_R4G4B4A4_UNORM:
+	case TDDS_R4G4B4X4_UNORM:
+
+		// 3 * 5 bits + 1 bit
+	case TDDS_B5G5R5A1_UNORM:
+	case TDDS_B5G5R5X1_UNORM:
+	case TDDS_R5G5B5A1_UNORM:
+	case TDDS_R5G5B5X1_UNORM:
+	case TDDS_A1R5G5B5_UNORM:
+	case TDDS_X1R5G5B5_UNORM:
+	case TDDS_A1B5G5R5_UNORM:
+	case TDDS_X1B5G5R5_UNORM:
+
+		// 1 * 6 bit + 2 * 5 bits
+	case TDDS_R5G6B5_UNORM:
+	case TDDS_B5G6R5_UNORM:
+		// 2 x 8 bits
+	case TDDS_A8P8:
+	case TDDS_R8G8_UNORM:
+	case TDDS_R8G8_SNORM:
+	case TDDS_G8R8_UNORM:
+	case TDDS_G8R8_SNORM:
+	case TDDS_R8G8_UINT:
+	case TDDS_R8G8_SINT:
+		// 1 * 16 bits
+	case TDDS_R16_UNORM:
+	case TDDS_R16_SNORM:
+	case TDDS_R16_UINT:
+	case TDDS_R16_SINT:
+	case TDDS_R16_SFLOAT:
+		return 2;
+
+		// 3 * 8 bits
+	case TDDS_R8G8B8_UNORM:
+	case TDDS_B8G8R8_UNORM:
+		return 3;
+	// 4 * 8 bits
+	case TDDS_A8B8G8R8_SNORM:
+	case TDDS_R8G8B8A8_SNORM:
+	case TDDS_R8G8B8A8_UINT:
+	case TDDS_R8G8B8A8_SINT:
+	case TDDS_R8G8B8A8_SRGB:
+	case TDDS_B8G8R8A8_SRGB:
+	case TDDS_B8G8R8A8_SNORM:
+
+	case TDDS_R8G8B8A8_UNORM:
+	case TDDS_R8G8B8X8_UNORM:
+	case TDDS_B8G8R8A8_UNORM:
+	case TDDS_B8G8R8X8_UNORM:
+	case TDDS_A8B8G8R8_UNORM:
+	case TDDS_X8B8G8R8_UNORM:
+	case TDDS_A8R8G8B8_UNORM:
+	case TDDS_X8R8G8B8_UNORM:
+
+		// 3 * 9 bits + 5 bits
+	case TDDS_R9G9B9E5_UFLOAT:
+		// 3 * 10 bits + 2 bits
+	case TDDS_R10G10B10_7E3_A2_FLOAT:
+	case TDDS_R10G10B10_6E4_A2_FLOAT:
+	case TDDS_R10G10B10_SNORM_A2_UNORM:
+
+	case TDDS_B10G10R10A2_UNORM:
+	case TDDS_B10G10R10A2_SNORM:
+	case TDDS_A2B10G10R10_UNORM:
+	case TDDS_A2B10G10R10_SNORM:
+	case TDDS_A2R10G10B10_UNORM:
+	case TDDS_A2R10G10B10_SNORM:
+	case TDDS_R10G10B10A2_UNORM:
+	case TDDS_R10G10B10A2_SNORM:
+	case TDDS_R10G10B10A2_UINT:
+
+		// 2 * 11 bits + 10 bits
+	case TDDS_R11G11B10_UFLOAT:
+		// 2 * 16 bits
+	case TDDS_R16G16_UNORM:
+	case TDDS_R16G16_SNORM:
+	case TDDS_R16G16_UINT:
+	case TDDS_R16G16_SINT:
+	case TDDS_R16G16_SFLOAT:
+	case TDDS_G16R16_UNORM:
+	case TDDS_G16R16_SNORM:
+		// 1 * 32 bits
+	case TDDS_R32_UINT:
+	case TDDS_R32_SINT:
+	case TDDS_R32_SFLOAT:
+		return 4;
+		// 4 * 16 bits
+	case TDDS_R16G16B16A16_UNORM:
+	case TDDS_R16G16B16A16_SNORM:
+	case TDDS_R16G16B16A16_UINT:
+	case TDDS_R16G16B16A16_SINT:
+	case TDDS_R16G16B16A16_SFLOAT:
+		// 2 * 32 bits
+	case TDDS_R32G32_UINT:
+	case TDDS_R32G32_SINT:
+	case TDDS_R32G32_SFLOAT:
+		return 8;
+		// 3 * 32 bits
+	case TDDS_R32G32B32_UINT:
+	case TDDS_R32G32B32_SINT:
+	case TDDS_R32G32B32_SFLOAT:
+		return 12;
+		// 4 * 32 bits
+	case TDDS_R32G32B32A32_UINT:
+	case TDDS_R32G32B32A32_SINT:
+	case TDDS_R32G32B32A32_SFLOAT:
+		return 16;
+		// block formats
+	case TDDS_BC1_RGBA_UNORM_BLOCK:
+	case TDDS_BC1_RGBA_SRGB_BLOCK:
+	case TDDS_BC4_UNORM_BLOCK:
+	case TDDS_BC4_SNORM_BLOCK:
+		return 8;
+
+ 	case TDDS_BC2_UNORM_BLOCK:
+	case TDDS_BC2_SRGB_BLOCK:
+	case TDDS_BC3_UNORM_BLOCK:
+	case TDDS_BC3_SRGB_BLOCK:
+	case TDDS_BC5_UNORM_BLOCK:
+	case TDDS_BC5_SNORM_BLOCK:
+	case TDDS_BC6H_UFLOAT_BLOCK:
+	case TDDS_BC6H_SFLOAT_BLOCK:
+	case TDDS_BC7_UNORM_BLOCK:
+	case TDDS_BC7_SRGB_BLOCK:
+		return 16;
+
+	case TDDS_UNDEFINED: return 0;
+		//	default: return 0;
+	case TDDS_AYUV:break;
+	case TDDS_Y410:break;
+	case TDDS_Y416:break;
+	case TDDS_NV12:break;
+	case TDDS_P010:break;
+	case TDDS_P016:break;
+	case TDDS_420_OPAQUE:break;
+	case TDDS_YUY2:break;
+	case TDDS_Y210:break;
+	case TDDS_Y216:break;
+	case TDDS_NV11:break;
+	case TDDS_AI44:break;
+	case TDDS_IA44:break;
+	case TDDS_D16_UNORM_S8_UINT:break;
+	case TDDS_R16_UNORM_X8_TYPELESS:break;
+	case TDDS_X16_TYPELESS_G8_UINT:break;
+	case TDDS_P208:break;
+	case TDDS_V208:break;
+	case TDDS_V408:break;
+	}
+	return 0;
+}
+
+#define TINYDDS_CHK_DDSFORMAT(bits, rm, gm, bm, am, fmt) \
+			if ((ctx->header.formatRGBBitCount == bits) && \
+					(ctx->header.formatRBitMask == rm) && \
+					(ctx->header.formatGBitMask == gm) && \
+					(ctx->header.formatBBitMask == bm) && \
+					(ctx->header.formatABitMask == am)) { return fmt; }
+
+static TinyDDS_Format TinyDDS_DecodeFormat(TinyDDS_Context *ctx) {
+	if (ctx->header.formatFlags & TINYDDS_DDPF_FOURCC) {
+		if (ctx->headerDx10.DXGIFormat != TIF_DXGI_FORMAT_UNKNOWN) {
+			return (TinyDDS_Format) ctx->headerDx10.DXGIFormat;
+		}
+
+		// check fourCC and some special numbers..
+		// unofficially during the dx9 timeline, D3D_FORMAT were stuck directly into
+		// formatFourCC field we handle FourCC and these < 119 codes here
+		// its unclear if this was only for formats that couldn't be exposed via
+		// Direct Draw Surfaces (like floats etc.) so I decode most of them anyway
+		switch (ctx->header.formatFourCC) {
+		case TINYDDS_D3DFMT_R8G8B8: return TDDS_R8G8B8_UNORM;
+		case TINYDDS_D3DFMT_A8R8G8B8: return TDDS_A8R8G8B8_UNORM;
+		case TINYDDS_D3DFMT_X8R8G8B8: return TDDS_X8R8G8B8_UNORM;
+		case TINYDDS_D3DFMT_R5G6B5: return TDDS_R5G6B5_UNORM;
+		case TINYDDS_D3DFMT_X1R5G5B5: return TDDS_X1R5G5B5_UNORM;
+		case TINYDDS_D3DFMT_A1R5G5B5: return TDDS_A1R5G5B5_UNORM;
+		case TINYDDS_D3DFMT_A4R4G4B4: return TDDS_A4R4G4B4_UNORM;
+		case TINYDDS_D3DFMT_R3G3B2: return TDDS_B2G3R3_UNORM;
+		case TINYDDS_D3DFMT_A8: return TDDS_A8_UNORM;
+		case TINYDDS_D3DFMT_A8R3G3B2: return TDDS_B2G3R3A8_UNORM;
+		case TINYDDS_D3DFMT_X4R4G4B4: return TDDS_A4R4G4B4_UNORM;
+		case TINYDDS_D3DFMT_A2B10G10R10: return TDDS_A2B10G10R10_UNORM;
+		case TINYDDS_D3DFMT_A8B8G8R8: return TDDS_A8B8G8R8_UNORM;
+		case TINYDDS_D3DFMT_X8B8G8R8: return TDDS_A8B8G8R8_UNORM;
+		case TINYDDS_D3DFMT_A2R10G10B10: return TDDS_A2R10G10B10_UNORM;
+		case TINYDDS_D3DFMT_G16R16: return TDDS_R16G16_UNORM;
+		case TINYDDS_D3DFMT_A16B16G16R16: return TDDS_R16G16B16A16_UNORM;
+		case TINYDDS_D3DFMT_R16F: return TDDS_R16_SFLOAT;
+		case TINYDDS_D3DFMT_G16R16F: return TDDS_R16G16_SFLOAT;
+		case TINYDDS_D3DFMT_A16B16G16R16F: return TDDS_R16G16B16A16_SFLOAT;
+		case TINYDDS_D3DFMT_A8P8: return TDDS_A8P8;
+		case TINYDDS_D3DFMT_P8: return TDDS_P8;
+		case TINYDDS_D3DFMT_L8: return TDDS_R8_UNORM;
+		case TINYDDS_D3DFMT_A8L8: return TDDS_R8G8_UNORM;
+		case TINYDDS_D3DFMT_A4L4: return TDDS_R4G4_UNORM;
+		case TINYDDS_D3DFMT_V8U8: return TDDS_G8R8_SNORM;
+		case TINYDDS_D3DFMT_L6V5U5: return TDDS_UNDEFINED; // TODO TDDS_R5G6B5_SNORM_PACK16;
+		case TINYDDS_D3DFMT_X8L8V8U8: return TDDS_R8G8B8A8_SNORM;
+		case TINYDDS_D3DFMT_Q8W8V8U8: return TDDS_R8G8B8A8_SNORM;
+		case TINYDDS_D3DFMT_V16U16: return TDDS_R16G16_SNORM;
+		case TINYDDS_D3DFMT_A2W10V10U10: return TDDS_A2B10G10R10_SNORM;
+		case TINYDDS_D3DFMT_L16: return TDDS_R16_UNORM;
+		case TINYDDS_D3DFMT_Q16W16V16U16: return TDDS_R16G16B16A16_SNORM;
+		case TINYDDS_D3DFMT_R32F: return TDDS_R32_SFLOAT;
+		case TINYDDS_D3DFMT_G32R32F: return TDDS_R32G32_SFLOAT;
+		case TINYDDS_D3DFMT_A32B32G32R32F: return TDDS_R32G32B32A32_SFLOAT;
+		case TINYDDS_D3DFMT_CxV8U8: return TDDS_UNDEFINED;
+		case TINYDDS_D3DFMT_A1: return TDDS_R1_UNORM;
+		case TINYDDS_D3DFMT_A2B10G10R10_XR_BIAS: return TDDS_UNDEFINED;
+
+			// real 4CC no exotics yet just the block compression ones
+		case TINYDDS_MAKE_RIFFCODE('D', 'X', 'T', '1'): return TDDS_BC1_RGBA_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('D', 'X', 'T', '2'): return TDDS_BC2_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('D', 'X', 'T', '3'): return TDDS_BC2_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('D', 'X', 'T', '4'): return TDDS_BC3_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('D', 'X', 'T', '5'): return TDDS_BC3_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('A', 'T', 'I', '1'): return TDDS_BC4_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('A', 'T', 'I', '2'): return TDDS_BC5_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('B', 'C', '4', 'U'): return TDDS_BC4_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('B', 'C', '4', 'S'): return TDDS_BC4_SNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('B', 'C', '5', 'U'): return TDDS_BC5_UNORM_BLOCK;
+		case TINYDDS_MAKE_RIFFCODE('B', 'C', '5', 'S'): return TDDS_BC5_SNORM_BLOCK;
+		}
+	}
+
+	// okay back to direct draw surface bit fields to try and work format out.
+	// TODO this could be better i'm sure
+
+	if ((ctx->header.formatFlags & TINYDDS_DDPF_PALETTEINDEXED4)) {
+		return TDDS_UNDEFINED; // TODO 4 bit CLUTs
+	}
+
+	if ((ctx->header.formatFlags & TINYDDS_DDPF_PALETTEINDEXED8)) {
+		if(ctx->header.formatRGBBitCount != 8) return TDDS_UNDEFINED;
+		if(ctx->header.formatFlags & TINYDDS_DDPF_ALPHA) {
+			return TDDS_A8P8;
+		} else {
+			return TDDS_P8;
+		}
+	}
+	// what is this? TINYDDS_DDPF_PALETTEINDEXEDTO8
+
+	// most have RGB data and/or alpha
+	if ((ctx->header.formatFlags & TINYDDS_DDPF_RGB) ||
+			(ctx->header.formatFlags & TINYDDS_DDPF_ALPHA)) {
+
+		TINYDDS_CHK_DDSFORMAT(1, 0x1, 0x0, 0, 0, TDDS_R1_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(8, 0xF0, 0x0F, 0, 0, TDDS_G4R4_UNORM);
+		TINYDDS_CHK_DDSFORMAT(8, 0x0F, 0xF0, 0, 0, TDDS_R4G4_UNORM);
+		TINYDDS_CHK_DDSFORMAT(8, 0xFF, 0, 0, 0, TDDS_R8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(8, 0, 0, 0, 0xFF, TDDS_A8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(8, 0xE0, 0x1C, 0x3, 0, TDDS_B2G3R3_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0xF000, 0x0F00, 0x00F0, 0x000F, TDDS_A4B4G4R4_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0xF000, 0x0F00, 0x00F0, 0x0000, TDDS_X4B4G4R4_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x00F0, 0x0F00, 0xF000, 0x000F, TDDS_A4R4G4B4_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x00F0, 0x0F00, 0xF000, 0x0000, TDDS_X4R4G4B4_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x0F00, 0x00F0, 0x000F, 0xF000, TDDS_B4G4R4A4_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x0F00, 0x00F0, 0x000F, 0x0000, TDDS_B4G4R4X4_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x000F, 0x00F0, 0x0F00, 0xF000, TDDS_R4G4B4A4_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x000F, 0x00F0, 0x0F00, 0x0000, TDDS_R4G4B4X4_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x7C00, 0x03E0, 0x001F, 0x8000, TDDS_B5G5R5A1_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x7C00, 0x03E0, 0x001F, 0x0000, TDDS_B5G5R5X1_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x001F, 0x03E0, 0x7C00, 0x8000, TDDS_R5G5B5A1_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x001F, 0x03E0, 0x7C00, 0x0000, TDDS_R5G5B5X1_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x003E, 0x07C0, 0xF800, 0x0001, TDDS_A1R5G5B5_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x003E, 0x07C0, 0xF800, 0x0000, TDDS_X1R5G5B5_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0xF800, 0x07C0, 0x003E, 0x0001, TDDS_A1B5G5R5_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0xF800, 0x07C0, 0x003E, 0x0000, TDDS_X1B5G5R5_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0xF800, 0x07E0, 0x001F, 0x0000, TDDS_B5G6R5_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x001F, 0x07E0, 0xF800, 0x0000, TDDS_R5G6B5_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0x00FF, 0xFF00, 0x0000, 0x0000, TDDS_R8G8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0xFF00, 0x00FF, 0x0000, 0x0000, TDDS_G8R8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0xFFFF, 0x0000, 0x0000, 0x0000, TDDS_R16_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(16, 0xE0, 0x1C, 0x3, 0xFF00, TDDS_B2G3R3A8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(24, 0xFF0000, 0x00FF00, 0x0000FF, 0x0, TDDS_B8G8R8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(24, 0x0000FF, 0x00FF00, 0xFF0000, 0x0, TDDS_R8G8B8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(32, 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000, TDDS_R8G8B8A8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x000000FF, 0x0000FF00, 0x00FF0000, 0x00000000, TDDS_R8G8B8X8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(32, 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, TDDS_B8G8R8A8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x00FF0000, 0x0000FF00, 0x000000FF, 0x00000000, TDDS_B8G8R8X8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(32, 0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF, TDDS_A8B8G8R8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0xFF000000, 0x00FF0000, 0x0000FF00, 0x00000000, TDDS_X8B8G8R8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(32, 0x0000FF00, 0x00FF0000, 0xFF000000, 0x000000FF, TDDS_A8R8G8B8_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x0000FF00, 0x00FF0000, 0xFF000000, 0x00000000, TDDS_X8R8G8B8_UNORM);
+
+		TINYDDS_CHK_DDSFORMAT(32, 0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000, TDDS_R10G10B10A2_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0xFFC00000, 0x003FF000, 0x00000FFC, 0x00000003, TDDS_A2B10G10R10_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x00000FFC, 0x003FF000, 0xFFC00000, 0x00000003, TDDS_A2R10G10B10_UNORM);
+
+		// this is often written incorrectly so we use the most 'common' version
+		TINYDDS_CHK_DDSFORMAT(32, 0x3FF00000, 0x000FFC00, 0x000003FF, 0xC0000000, TDDS_B10G10R10A2_UNORM);
+
+
+		TINYDDS_CHK_DDSFORMAT(32, 0xFFFF0000, 0x0000FFFF, 0x00000000, 0x00000000, TDDS_G16R16_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000, TDDS_R16G16_UNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, TDDS_R32_UINT);
+
+		if (ctx->header.formatRGBBitCount == 8) return TDDS_R8_UINT;
+		if (ctx->header.formatRGBBitCount == 16) return TDDS_R16_UINT;
+		if (ctx->header.formatRGBBitCount == 32) return TDDS_R32_UINT;
+	}
+
+	if ((ctx->header.formatFlags & TINYDDS_DDPF_BUMPDUDV) ||
+			(ctx->header.formatFlags & TINYDDS_DDPF_BUMPLUMINANCE)) {
+		TINYDDS_CHK_DDSFORMAT(16, 0xFF00, 0x00FF, 0x0000, 0x0000, TDDS_G8R8_SNORM);
+		TINYDDS_CHK_DDSFORMAT(16, 0x00FF, 0xFF00, 0x0000, 0x0000, TDDS_R8G8_SNORM);
+
+		TINYDDS_CHK_DDSFORMAT(32, 0xFFFF0000, 0x0000FFFF, 0x0000, 0x0, TDDS_G16R16_SNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x0000FFFF, 0xFFFF0000, 0x0000, 0x0, TDDS_R16G16_SNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000, TDDS_R8G8B8A8_SNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000, TDDS_R10G10B10A2_SNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x3FF00000, 0x000FFC00, 0x000003FF, 0xC0000000, TDDS_B10G10R10A2_SNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0x00000FFC, 0x003FF000, 0xFFC00000, 0x00000003, TDDS_A2R10G10B10_SNORM);
+		TINYDDS_CHK_DDSFORMAT(32, 0xFFC00000, 0x003FF000, 0x00000FFC, 0x00000003, TDDS_A2B10G10R10_SNORM);
+
+		if (ctx->header.formatRGBBitCount == 8) return TDDS_R8_SINT;
+		if (ctx->header.formatRGBBitCount == 16) return TDDS_R16_SINT;
+		if (ctx->header.formatRGBBitCount == 32) return TDDS_R32_SINT;
+	}
+
+	if (ctx->header.formatFlags & TINYDDS_DDPF_LUMINANCE) {
+		TINYDDS_CHK_DDSFORMAT(8, 0x0F, 0x00, 0x00, 0xF0, TDDS_R4G4_UNORM); // this is A4L4 aka A4R4 we decode this as R4G4
+		TINYDDS_CHK_DDSFORMAT(16, 0x00FF, 0x0000, 0x0000, 0xFF00, TDDS_R8G8_UNORM); // this is A8L8 aka A4R8 we decode this as R8G8
+
+		if (ctx->header.formatRGBBitCount == 8) return TDDS_R8_UNORM;
+		if (ctx->header.formatRGBBitCount == 16) return TDDS_R16_UNORM;
+		if (ctx->header.formatRGBBitCount == 32) return TDDS_R32_UINT;
+
+	}
+
+	return TDDS_UNDEFINED;
+}
+#undef TINYDDS_CHK_DDSFORMAT
+
+static uint32_t TinyDDS_MipMapReduce(uint32_t value, uint32_t mipmaplevel) {
+
+	// handle 0 being passed in
+	if (value <= 1)
+		return 1;
+
+	// there are better ways of doing this (log2 etc.) but this doesn't require any
+	// dependecies and isn't used enough to matter imho
+	for (uint32_t i = 0u; i < mipmaplevel; ++i) {
+		if (value <= 1)
+			return 1;
+		value = value / 2;
+	}
+	return value;
+}
+
+bool TinyDDS_ReadHeader(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+
+	ctx->headerPos = ctx->callbacks.tellFn(ctx->user);
+	if( ctx->callbacks.readFn(ctx->user, &ctx->header, sizeof(TinyDDS_Header)) != sizeof(TinyDDS_Header)) {
+		ctx->callbacks.errorFn(ctx->user, "Could not read DDS header");
+		return false;
+	}
+
+	// try the easy case of a modern dx10 DDS file
+	if ((ctx->header.formatFlags & TINYDDS_DDPF_FOURCC) &&
+			(ctx->header.formatFourCC == TINYDDS_MAKE_RIFFCODE('D', 'X', '1', '0'))) {
+		ctx->callbacks.readFn(ctx->user, &ctx->headerDx10, sizeof(TinyDDS_HeaderDX10));
+
+		if (ctx->headerDx10.DXGIFormat >= TDDS_SYNTHESISED_DXGIFORMATS) {
+			ctx->callbacks.errorFn(ctx->user, "DX10 Header has an invalid DXGI_FORMAT code");
+			return false;
+		}
+	}
+
+	ctx->format = TinyDDS_DecodeFormat(ctx);
+	if (ctx->format == TDDS_UNDEFINED) {
+		ctx->callbacks.errorFn(ctx->user, "Could not decode DDS format");
+		return false;
+	}
+
+	if(	(ctx->header.formatFourCC == 0) &&
+			(ctx->header.formatRGBBitCount != 0) &&
+			((ctx->header.formatRGBBitCount/8) != TinyDDS_FormatSize(ctx->format))) {
+		ctx->callbacks.errorFn(ctx->user, "Format size mismatch");
+		return false;
+	}
+
+	// correct for dodgy mipmap levels counts
+	if(ctx->header.mipMapCount > 1) {
+		uint32_t w = ctx->header.width;
+		uint32_t h = ctx->header.height;
+
+		for(uint32_t i = 0; i < ctx->header.mipMapCount;++i) {
+			if (TinyDDS_IsCompressed(ctx->format)) {
+				if (w <= 4 || h <= 4) {
+					ctx->header.mipMapCount = i + 1;
+					break;
+				}
+			} else if (w <= 1 || h <= 1) {
+				ctx->header.mipMapCount = i + 1;
+				break;
+			}
+
+
+			w = w / 2;
+			h = h / 2;
+		}
+
+	}
+
+	if (TinyDDS_IsCompressed(ctx->format)) {
+		// compressed images never get asked to make mip maps which is good as
+		// requires decompress/compress cycle
+		if(ctx->header.mipMapCount == 0) ctx->header.mipMapCount = 1;
+	}
+
+	if(TinyDDS_IsCLUT(ctx->format)) {
+		// for now don't ask to generate mipmaps for cluts
+		if(ctx->header.mipMapCount == 0) ctx->header.mipMapCount = 1;
+
+		size_t const clutSize = 256 * sizeof(uint32_t);
+
+		ctx->clut = (uint32_t*) ctx->callbacks.allocFn(ctx->user, clutSize);
+
+		if( ctx->callbacks.readFn(ctx->user, (void*)ctx->clut, clutSize) != clutSize) {
+			ctx->callbacks.errorFn(ctx->user, "Could not read DDS CLUT");
+			return false;
+		}
+	}
+
+	ctx->firstImagePos = ctx->callbacks.tellFn(ctx->user);
+	ctx->headerValid = true;
+	return true;
+}
+
+bool TinyDDS_IsCubemap(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+
+	return (ctx->header.caps2 & TINYDDS_DDSCAPS2_CUBEMAP);
+}
+
+bool TinyDDS_Dimensions(TinyDDS_ContextHandle handle,
+												uint32_t *width,
+												uint32_t *height,
+												uint32_t *depth,
+												uint32_t *slices) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+
+	if (width)
+		*width = ctx->header.width;
+	if (height)
+		*height = ctx->header.height;
+	if (depth)
+		*depth = ctx->header.depth;
+	if (slices)
+		*slices = ctx->headerDx10.arraySize;
+	return true;
+}
+
+uint32_t TinyDDS_Width(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+	return ctx->header.width;
+}
+
+uint32_t TinyDDS_Height(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+	return ctx->header.height;
+}
+
+uint32_t TinyDDS_Depth(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+
+	return ctx->header.depth;
+}
+
+uint32_t TinyDDS_ArraySlices(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+
+	return ctx->headerDx10.arraySize;
+}
+
+bool TinyDDS_Is1D(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+	return (ctx->header.height <= 1 && ctx->header.depth <= 1);
+}
+bool TinyDDS_Is2D(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+	return (ctx->header.height > 1 && ctx->header.depth <= 1);
+}
+bool TinyDDS_Is3D(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+
+	return (ctx->header.height > 1 && ctx->header.depth > 1);
+}
+
+bool TinyDDS_IsArray(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+
+	return (ctx->headerDx10.arraySize >= 1);
+}
+
+uint32_t TinyDDS_NumberOfMipmaps(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+
+	return ctx->header.mipMapCount ? ctx->header.mipMapCount : 1;
+}
+
+bool TinyDDS_NeedsGenerationOfMipmaps(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return false;
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return false;
+	}
+
+	return ctx->header.mipMapCount == 0;
+}
+
+bool TinyDDS_NeedsEndianCorrecting(TinyDDS_ContextHandle handle) {
+	// TODO should return true if this file is compiled on big endian machines
+	BASISU_NOTE_UNUSED(handle);
+	return false;
+}
+
+uint32_t TinyDDS_FaceSize(TinyDDS_ContextHandle handle, uint32_t mipmaplevel) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+	uint32_t w = TinyDDS_MipMapReduce(ctx->header.width, mipmaplevel);
+	uint32_t h = TinyDDS_MipMapReduce(ctx->header.height, mipmaplevel);
+	uint32_t d = TinyDDS_MipMapReduce(ctx->header.depth, mipmaplevel);
+	uint32_t s = ctx->headerDx10.arraySize ? ctx->headerDx10.arraySize : 1;
+
+	if(d > 1 && s > 1) {
+		ctx->callbacks.errorFn(ctx->user, "Volume textures can't have array slices or be cubemap");
+		return 0;
+	}
+
+	if (TinyDDS_IsCompressed(ctx->format)) {
+		// padd to block boundaries
+		w = (w + 3) / 4;
+		h = (h + 3) / 4;
+	}
+	// 1 bit special case
+	if(ctx->format == TDDS_R1_UNORM) {
+		w = (w + 7) / 8;
+	}
+
+	uint32_t const formatSize = TinyDDS_FormatSize(ctx->format);
+	return w * h * d * s * formatSize;
+}
+
+uint32_t TinyDDS_ImageSize(TinyDDS_ContextHandle handle, uint32_t mipmaplevel) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return 0;
+
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return 0;
+	}
+
+	if(	ctx->header.caps2 & TINYDDS_DDSCAPS2_CUBEMAP ||
+			ctx->headerDx10.miscFlag & TINYDDS_D3D10_RESOURCE_MISC_TEXTURECUBE ) {
+		return TinyDDS_FaceSize(handle, mipmaplevel) * 6;
+	} else {
+		return TinyDDS_FaceSize(handle, mipmaplevel);
+	}
+}
+
+void const *TinyDDS_ImageRawData(TinyDDS_ContextHandle handle, uint32_t mipmaplevel) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return NULL;
+
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return NULL;
+	}
+
+	if (mipmaplevel >= (ctx->header.mipMapCount ? ctx->header.mipMapCount : 1) ) {
+		ctx->callbacks.errorFn(ctx->user, "Invalid mipmap level");
+		return NULL;
+	}
+
+	if (mipmaplevel >= TINYDDS_MAX_MIPMAPLEVELS) {
+		ctx->callbacks.errorFn(ctx->user, "Invalid mipmap level");
+		return NULL;
+	}
+
+	if (ctx->mipmaps[mipmaplevel] != NULL)
+		return ctx->mipmaps[mipmaplevel];
+
+	if(	ctx->header.caps2 & TINYDDS_DDSCAPS2_CUBEMAP ||
+			ctx->headerDx10.miscFlag & TINYDDS_D3D10_RESOURCE_MISC_TEXTURECUBE ) {
+
+		uint64_t offset = 0;
+		for(uint32_t i=0;i < mipmaplevel;++i) {
+			offset += TinyDDS_FaceSize(handle, i);
+		}
+
+		uint32_t mipMapCount = ctx->header.mipMapCount;
+		if(mipMapCount == 0) mipMapCount = 1;
+
+		// at least one cubemap generater has mipMapCount wrong which causes
+		// image artifacts :(
+		uint64_t nextFaceOffset = 0;
+		for(uint32_t i = 0;i < mipMapCount;++i) {
+			nextFaceOffset += TinyDDS_FaceSize(handle, i);
+		}
+
+		size_t const faceSize = TinyDDS_FaceSize(handle, mipmaplevel);
+		ctx->mipmaps[mipmaplevel] = (uint8_t const *) ctx->callbacks.allocFn(ctx->user, faceSize * 6);
+		if(!ctx->mipmaps[mipmaplevel]) return NULL;
+
+		uint8_t *dstPtr = (uint8_t*)ctx->mipmaps[mipmaplevel];
+		for (uint32_t i = 0u;i < 6;++i) {
+			ctx->callbacks.seekFn(ctx->user, offset + ctx->firstImagePos);
+			size_t read = ctx->callbacks.readFn(ctx->user, (void *) dstPtr, faceSize);
+			if(read != faceSize) {
+				ctx->callbacks.freeFn(ctx->user, (void*)&ctx->mipmaps[mipmaplevel]);
+				return NULL;
+			}
+			dstPtr += faceSize;
+			offset += nextFaceOffset;
+		}
+		return ctx->mipmaps[mipmaplevel];
+	}
+
+	uint64_t offset = 0;
+	for(uint32_t i=0;i < mipmaplevel;++i) {
+		offset += TinyDDS_ImageSize(handle, i);
+	}
+
+	uint32_t size = TinyDDS_ImageSize(handle, mipmaplevel);
+	if (size == 0)
+		return NULL;
+
+	ctx->callbacks.seekFn(ctx->user, offset + ctx->firstImagePos);
+
+	ctx->mipmaps[mipmaplevel] = (uint8_t const *) ctx->callbacks.allocFn(ctx->user, size);
+	if (!ctx->mipmaps[mipmaplevel]) return NULL;
+	size_t read = ctx->callbacks.readFn(ctx->user, (void *) ctx->mipmaps[mipmaplevel], size);
+	if(read != size) {
+		ctx->callbacks.freeFn(ctx->user, (void*)&ctx->mipmaps[mipmaplevel]);
+		return NULL;
+	}
+
+	return ctx->mipmaps[mipmaplevel];
+}
+
+TinyDDS_Format TinyDDS_GetFormat(TinyDDS_ContextHandle handle) {
+	TinyDDS_Context *ctx = (TinyDDS_Context *) handle;
+	if (ctx == NULL)
+		return TDDS_UNDEFINED;
+
+	if (!ctx->headerValid) {
+		ctx->callbacks.errorFn(ctx->user, "Header data hasn't been read yet or its invalid");
+		return TDDS_UNDEFINED;
+	}
+	return ctx->format;
+}
+
+#define TDDS_EF(bits, rm, gm, bm, am, fl) \
+		header->formatRGBBitCount = bits; \
+		header->formatRBitMask = rm; \
+		header->formatGBitMask = gm; \
+		header->formatBBitMask = bm; \
+		header->formatABitMask = am; \
+		header->formatFlags = fl; \
+		header->formatFourCC = 0; \
+		return true;
+
+#define TDDS_EF_RGB(bits, rm, gm, bm) TDDS_EF(bits, rm, gm, bm, 0, TINYDDS_DDPF_RGB )
+#define TDDS_EF_RGBA(bits, rm, gm, bm, am) TDDS_EF(bits, rm, gm, bm, am, TINYDDS_DDPF_RGB | TINYDDS_DDPF_ALPHAPIXELS)
+#define TDDS_EF_ALPHA(bits, am) TDDS_EF(bits, 0, 0, 0, am, TINYDDS_DDPF_ALPHA)
+
+#define TDDS_EF_BUMP_RG(bits, rm, gm) TDDS_EF(bits, rm, gm, 0, 0, TINYDDS_DDPF_BUMPDUDV)
+#define TDDS_EF_BUMP_RGB(bits, rm, gm, bm) TDDS_EF(bits, rm, gm, bm, 0, TINYDDS_DDPF_BUMPLUMINANCE)
+#define TDDS_EF_BUMP_RGBA(bits, rm, gm, bm, am) TDDS_EF(bits, rm, gm, bm, am, TINYDDS_DDPF_BUMPLUMINANCE | TINYDDS_DDPF_ALPHAPIXELS)
+
+static bool TinyDDS_EncodeFormat(TinyDDS_Format fmt, TinyDDS_Header* header, TinyDDS_HeaderDX10* headerDx10) {
+	// lets start with the easy part. if its real DXGI_FORMAT we can just fill in the Dx10 part
+	if(fmt < TDDS_SYNTHESISED_DXGIFORMATS) {
+		headerDx10->DXGIFormat = (TinyImageFormat_DXGI_FORMAT)fmt;
+		header->formatFourCC = TINYDDS_MAKE_RIFFCODE('D','X','1','0');
+		header->formatFlags = TINYDDS_DDPF_FOURCC;
+	} else {
+		headerDx10->DXGIFormat = TIF_DXGI_FORMAT_UNKNOWN;
+	}
+	// now lets try synthesising if possible
+	// if we can reset the DX10 fourCC but leave the format in place
+	// that way if we have slices which can only be DXGI_FORMAT we can use it
+	switch(fmt) {
+	case TDDS_UNDEFINED: break;
+
+	case TDDS_R1_UNORM: TDDS_EF_RGB(1, 0x1, 0, 0)
+	case TDDS_R4G4_UNORM: TDDS_EF_RGB(8, 0x0F, 0xF0, 0)
+	case TDDS_G4R4_UNORM: TDDS_EF_RGB(8, 0xF0, 0x0F, 0)
+	case TDDS_B2G3R3_UNORM: TDDS_EF_RGB(8, 0x3, 0x7, 0x7 )
+	case TDDS_R8_UNORM: TDDS_EF_RGB(8, 0xFF, 0, 0 );
+	case TDDS_A8_UNORM: TDDS_EF_ALPHA( 8, 0xFF);
+
+	case TDDS_R16_UNORM:TDDS_EF_RGB( 16,0x0000FFFF, 0, 0)
+	case TDDS_A4B4G4R4_UNORM:
+		TDDS_EF_RGBA(16, 0xF000, 0x0F00, 0x00F0, 0x000F);
+	case TDDS_X4B4G4R4_UNORM:
+		TDDS_EF_RGBA(16, 0xF000, 0x0F00, 0x00F0, 0x0000);
+	case TDDS_B4G4R4A4_UNORM:
+		TDDS_EF_RGBA(16, 0x0F00, 0x00F0, 0x000F, 0xF000);
+	case TDDS_B4G4R4X4_UNORM:
+		TDDS_EF_RGBA(16, 0x0F00, 0x00F0, 0x000F, 0x0000);
+	case TDDS_A4R4G4B4_UNORM:
+		TDDS_EF_RGBA(16, 0x00F0, 0x0F00, 0xF000, 0x000F);
+	case TDDS_X4R4G4B4_UNORM:
+		TDDS_EF_RGBA(16, 0x00F0, 0x0F00, 0xF000, 0x0000);
+	case TDDS_R4G4B4A4_UNORM:
+		TDDS_EF_RGBA(16, 0x000F, 0x00F0, 0x0F00, 0xF000);
+	case TDDS_R4G4B4X4_UNORM:
+		TDDS_EF_RGBA(16, 0x000F, 0x00F0, 0x0F00, 0x0000);
+
+	case TDDS_B5G5R5A1_UNORM:
+		TDDS_EF_RGBA(16, 0x7C00, 0x03E0, 0x001F, 0x8000);
+	case TDDS_B5G5R5X1_UNORM:
+		TDDS_EF_RGBA(16, 0x7C00, 0x03E0, 0x001F, 0x0000);
+
+	case TDDS_R5G5B5A1_UNORM:
+	TDDS_EF_RGBA(16, 0x001F, 0x03E0, 0x7C00, 0x8000);
+	case TDDS_R5G5B5X1_UNORM:
+	TDDS_EF_RGBA(16, 0x001F, 0x03E0, 0x7C00, 0x0000);
+
+	case TDDS_A1R5G5B5_UNORM:
+		TDDS_EF_RGBA(16, 0x003E, 0x07C0, 0xF800, 0x0001);
+	case TDDS_X1R5G5B5_UNORM:
+		TDDS_EF_RGBA(16, 0x003E, 0x07C0, 0xF800, 0x0000);
+	case TDDS_A1B5G5R5_UNORM:
+		TDDS_EF_RGBA(16, 0xF800, 0x07C0, 0x003E, 0x0001);
+	case TDDS_X1B5G5R5_UNORM:
+		TDDS_EF_RGBA(16, 0xF800, 0x07C0, 0x003E, 0x0000);
+
+	case TDDS_B5G6R5_UNORM:
+		TDDS_EF_RGB(16, 0xF800, 0x07E0, 0x001F);
+	case TDDS_R5G6B5_UNORM:
+		TDDS_EF_RGB(16, 0x001F, 0x07E0, 0xF800);
+
+	case TDDS_R8G8_UNORM:
+		TDDS_EF_RGB(16, 0x00FF, 0xFF00, 0);
+	case TDDS_G8R8_UNORM:
+		TDDS_EF_RGB(16, 0xFF00, 0x00FF, 0);
+	case TDDS_G8R8_SNORM:
+		TDDS_EF_BUMP_RG(16, 0xFF00, 0x00FF);
+
+	case TDDS_B2G3R3A8_UNORM: TDDS_EF_RGBA(8, 0x3, 0x7, 0x7, 0xFF00 )
+
+	case TDDS_R8G8B8_UNORM:
+	TDDS_EF_RGB( 24,0x000000FF, 0x0000FF00, 0x00FF0000)
+	case TDDS_B8G8R8_UNORM:
+	TDDS_EF_RGB( 24,0x00FF0000, 0x0000FF00, 0x000000FF)
+
+	case TDDS_R8G8B8A8_UNORM:
+	TDDS_EF_RGBA( 32,0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000)
+	case TDDS_R8G8B8X8_UNORM:
+	TDDS_EF_RGBA( 32,0x000000FF, 0x0000FF00, 0x00FF0000, 0x00000000)
+	case TDDS_B8G8R8A8_UNORM:
+	TDDS_EF_RGBA( 32,0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000)
+	case TDDS_B8G8R8X8_UNORM:
+	TDDS_EF_RGBA( 32,0x00FF0000, 0x0000FF00, 0x000000FF, 0x00000000)
+	case TDDS_A8B8G8R8_UNORM:
+	TDDS_EF_RGBA( 32,0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF)
+	case TDDS_X8B8G8R8_UNORM:
+	TDDS_EF_RGBA( 32,0xFF000000, 0x00FF0000, 0x0000FF00, 0x00000000)
+	case TDDS_A8R8G8B8_UNORM:
+	TDDS_EF_RGBA( 32,0x0000FF00, 0x00FF0000, 0xFF000000, 0x000000FF)
+	case TDDS_X8R8G8B8_UNORM:
+	TDDS_EF_RGBA( 32,0x0000FF00, 0x00FF0000, 0xFF000000, 0x00000000)
+
+	/* A2R10G10B10 is broken via the traditional DDS descriptions, so we
+	 * always use the Dx10 header for those
+	case TDDS_R10G10B10A2_UNORM:
+	TDDS_EF_RGBA( 32,0x3FF00000, 0x000FFC00, 0x000003FF, 0xC0000000)
+	case TDDS_A2B10G10R10_UNORM:
+	TDDS_EF_RGBA( 32,0xFFC00000, 0x003FF000, 0x00000FFC, 0x00000003)
+	case TDDS_A2R10G10B10_UNORM:
+	TDDS_EF_RGBA( 32,0x00000FFC, 0x003FF000, 0xFFC00000, 0x00000003)
+	case TDDS_B10G10R10A2_UNORM:
+	TDDS_EF_RGBA( 32,0x3FF00000, 0x000FFC00, 0x000003FF, 0xC0000000)
+	*/
+	case TDDS_R10G10B10A2_UNORM:
+	case TDDS_B10G10R10A2_UNORM:
+	case TDDS_A2B10G10R10_UNORM:
+	case TDDS_A2R10G10B10_UNORM:
+	case TDDS_R10G10B10A2_SNORM:
+	case TDDS_B10G10R10A2_SNORM:
+	case TDDS_A2B10G10R10_SNORM:
+	case TDDS_A2R10G10B10_SNORM:
+		break;
+
+	case TDDS_R16G16_UNORM: TDDS_EF_RGB( 32,0x0000FFFF, 0xFFFF0000, 0)
+	case TDDS_G16R16_UNORM: TDDS_EF_RGB( 32,0xFFFF0000, 0x0000FFFF, 0)
+
+	case TDDS_BC1_RGBA_UNORM_BLOCK:
+		header->formatFourCC = TINYDDS_MAKE_RIFFCODE('D','X','T','1');
+		header->formatFlags = TINYDDS_DDPF_FOURCC;
+		return true;
+	case TDDS_BC2_UNORM_BLOCK:
+		header->formatFourCC = TINYDDS_MAKE_RIFFCODE('D','X','T','3');
+		header->formatFlags = TINYDDS_DDPF_FOURCC;
+		return true;
+	case TDDS_BC3_UNORM_BLOCK:
+		header->formatFourCC = TINYDDS_MAKE_RIFFCODE('D','X','T','5');
+		header->formatFlags = TINYDDS_DDPF_FOURCC;
+		return true;
+	case TDDS_BC4_UNORM_BLOCK:
+		header->formatFourCC = TINYDDS_MAKE_RIFFCODE('A','T','I','1');
+		header->formatFlags = TINYDDS_DDPF_FOURCC;
+		return true;
+	case TDDS_BC5_UNORM_BLOCK:
+		header->formatFourCC = TINYDDS_MAKE_RIFFCODE('A','T','I','2');
+		header->formatFlags = TINYDDS_DDPF_FOURCC;
+		return true;
+
+
+	case TDDS_R8_SNORM:
+	case TDDS_R8G8_SNORM:
+	case TDDS_R8G8B8A8_SNORM:
+	case TDDS_R16_SNORM:
+	case TDDS_R16G16_SNORM:
+	case TDDS_A8B8G8R8_SNORM:
+	case TDDS_B8G8R8A8_SNORM:
+	case TDDS_G16R16_SNORM:
+
+	case TDDS_R8_UINT:
+	case TDDS_R8_SINT:
+	case TDDS_R8G8_UINT:
+	case TDDS_R8G8_SINT:
+	case TDDS_R8G8B8A8_UINT:
+	case TDDS_R8G8B8A8_SINT:
+	case TDDS_R8G8B8A8_SRGB:
+	case TDDS_B8G8R8A8_SRGB:
+	case TDDS_R9G9B9E5_UFLOAT:
+	case TDDS_R10G10B10A2_UINT:
+	case TDDS_R11G11B10_UFLOAT:
+	case TDDS_R16_UINT:
+	case TDDS_R16_SINT:
+	case TDDS_R16_SFLOAT:
+	case TDDS_R16G16_UINT:
+	case TDDS_R16G16_SINT:
+	case TDDS_R16G16_SFLOAT:
+	case TDDS_R16G16B16A16_UNORM:
+	case TDDS_R16G16B16A16_SNORM:
+	case TDDS_R16G16B16A16_UINT:
+	case TDDS_R16G16B16A16_SINT:
+	case TDDS_R16G16B16A16_SFLOAT:
+	case TDDS_R32_UINT:
+	case TDDS_R32_SINT:
+	case TDDS_R32_SFLOAT:
+	case TDDS_R32G32_UINT:
+	case TDDS_R32G32_SINT:
+	case TDDS_R32G32_SFLOAT:
+	case TDDS_R32G32B32_UINT:
+	case TDDS_R32G32B32_SINT:
+	case TDDS_R32G32B32_SFLOAT:
+	case TDDS_R32G32B32A32_UINT:
+	case TDDS_R32G32B32A32_SINT:
+	case TDDS_R32G32B32A32_SFLOAT:
+	case TDDS_BC1_RGBA_SRGB_BLOCK:
+	case TDDS_BC2_SRGB_BLOCK:
+	case TDDS_BC3_SRGB_BLOCK:
+	case TDDS_BC4_SNORM_BLOCK:
+	case TDDS_BC5_SNORM_BLOCK:
+	case TDDS_BC6H_UFLOAT_BLOCK:
+	case TDDS_BC6H_SFLOAT_BLOCK:
+	case TDDS_BC7_UNORM_BLOCK:
+	case TDDS_BC7_SRGB_BLOCK:
+	case TDDS_AYUV:
+	case TDDS_Y410:
+	case TDDS_Y416:
+	case TDDS_NV12:
+	case TDDS_P010:
+	case TDDS_P016:
+	case TDDS_420_OPAQUE:
+	case TDDS_YUY2:
+	case TDDS_Y210:
+	case TDDS_Y216:
+	case TDDS_NV11:
+	case TDDS_AI44:
+	case TDDS_IA44:
+	case TDDS_P8:
+	case TDDS_A8P8:
+	case TDDS_R10G10B10_7E3_A2_FLOAT:
+	case TDDS_R10G10B10_6E4_A2_FLOAT:
+	case TDDS_D16_UNORM_S8_UINT:
+	case TDDS_R16_UNORM_X8_TYPELESS:
+	case TDDS_X16_TYPELESS_G8_UINT:
+	case TDDS_P208:
+	case TDDS_V208:
+	case TDDS_V408:
+	case TDDS_R10G10B10_SNORM_A2_UNORM:
+		break;
+
+	}
+	// these formats can probably be done via dx10 header so check
+	if(headerDx10->DXGIFormat == TIF_DXGI_FORMAT_UNKNOWN) return false;
+	else return true;
+}
+
+#undef TDDS_EF
+#undef TDDS_EF_RGB
+#undef TDDS_EF_RGBA
+#undef TDDS_EF_ALPHA
+
+bool TinyDDS_WriteImage(TinyDDS_WriteCallbacks const *callbacks,
+												void *user,
+												uint32_t width,
+												uint32_t height,
+												uint32_t depth,	 // 3D texture depth
+												uint32_t slices, // Array slices
+												uint32_t mipmaplevels,
+												TinyDDS_Format format,
+												bool cubemap,
+												bool preferDx10Format,
+												uint32_t const *mipmapsizes,
+												void const **mipmaps) {
+	TinyDDS_Header header;
+	TinyDDS_HeaderDX10 headerDX10;
+	memset(&header, 0, sizeof(header));
+	memset(&headerDX10, 0, sizeof(headerDX10));
+
+	header.magic = TINYDDS_MAKE_RIFFCODE('D', 'D', 'S', ' ');
+	header.size = 124;
+	header.formatSize = 32;
+
+	header.width = width;
+	header.height = height;
+	header.depth = (depth > 1) ? depth : 0;
+	header.mipMapCount = mipmaplevels;
+
+	if(!TinyDDS_EncodeFormat(format, &header, &headerDX10)) return false;
+
+	// do we have to force dx10 (for slices)
+	if (slices > 1) {
+		if(headerDX10.DXGIFormat == TIF_DXGI_FORMAT_UNKNOWN) {
+			// DDS doesn't support slices for formats that aren't DXGI compatible
+			return false;
+		}
+		header.formatFlags = TINYDDS_DDPF_FOURCC;
+		header.formatFourCC = TINYDDS_MAKE_RIFFCODE('D','X','1','0');
+		headerDX10.arraySize = slices;
+	}
+	header.flags = TINYDDS_DDSD_CAPS | TINYDDS_DDSD_PIXELFORMAT | TINYDDS_DDSD_MIPMAPCOUNT;
+	header.caps1 = TINYDDS_DDSCAPS_TEXTURE | TINYDDS_DDSCAPS_COMPLEX | TINYDDS_DDSCAPS_MIPMAP;
+
+	if(depth > 1) {
+		headerDX10.resourceDimension = TINYDDS_D3D10_RESOURCE_DIMENSION_TEXTURE3D;
+		header.flags |= TINYDDS_DDSD_DEPTH;
+		header.caps2 |= TINYDDS_DDSCAPS2_VOLUME;
+	}
+	else if(height > 1) {
+		headerDX10.resourceDimension = TINYDDS_D3D10_RESOURCE_DIMENSION_TEXTURE2D;
+		header.flags |= TINYDDS_DDSD_HEIGHT;
+	}
+	else if(width > 1) {
+		headerDX10.resourceDimension = TINYDDS_D3D10_RESOURCE_DIMENSION_TEXTURE1D;
+		header.flags |= TINYDDS_DDSD_WIDTH;
+	}
+	if(cubemap) {
+		headerDX10.miscFlag |= TINYDDS_D3D10_RESOURCE_MISC_TEXTURECUBE;
+		header.caps2 |= TINYDDS_DDSCAPS2_CUBEMAP | TINYDDS_DDSCAPS2_CUBEMAP_ALL;
+	}
+
+	// unclear whether we need to save this or exactly what it should be...
+	header.pitchOrLinearSize = 0;
+	if(preferDx10Format && headerDX10.DXGIFormat != TIF_DXGI_FORMAT_UNKNOWN) {
+		header.formatFlags = TINYDDS_DDPF_FOURCC;
+		header.formatFourCC = TINYDDS_MAKE_RIFFCODE('D','X','1','0');
+	}
+
+	// now write
+	callbacks->write(user, &header, sizeof(TinyDDS_Header));
+	if(header.formatFlags & TINYDDS_DDPF_FOURCC &&
+			header.formatFourCC == TINYDDS_MAKE_RIFFCODE('D','X','1','0')) {
+		callbacks->write(user, &headerDX10, sizeof(TinyDDS_HeaderDX10));
+	}
+
+	// rg 8/27/2024: The original tinydds.h code is wrong for mipmapped cubemaps.
+	// I'm going to work around this by having the caller compose the top mip data correctly.
+	// https://learn.microsoft.com/en-us/windows/win32/direct3ddds/dds-file-layout-for-cubic-environment-maps
+	for (uint32_t mipMapLevel = 0; mipMapLevel < header.mipMapCount; mipMapLevel++) 
+	{
+		// rg: Adding this check, in case the caller wants to compose all the data themselves.
+		if (mipmapsizes[mipMapLevel])
+		{
+			callbacks->write(user, mipmaps[mipMapLevel], mipmapsizes[mipMapLevel]);
+		}
+	}
+	return true;
+}
+
+#endif
+
+#ifdef __cplusplus
+};
+#endif
+
+#endif // end header
+/*
+MIT License
+
+Copyright (c) 2019 DeanoC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
\ No newline at end of file
diff --git a/encoder/3rdparty/tinyexr.cpp b/encoder/3rdparty/tinyexr.cpp
new file mode 100644
index 0000000..5548c5a
--- /dev/null
+++ b/encoder/3rdparty/tinyexr.cpp
@@ -0,0 +1,12 @@
+#if defined(_WIN32)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable:4530) //  warning C4530: C++ exception handler used, but unwind semantics are not enabled. Specify /EHsc
+#endif
+
+#define TINYEXR_IMPLEMENTATION
+#include "tinyexr.h"
diff --git a/encoder/3rdparty/tinyexr.h b/encoder/3rdparty/tinyexr.h
new file mode 100644
index 0000000..a2a065a
--- /dev/null
+++ b/encoder/3rdparty/tinyexr.h
@@ -0,0 +1,9334 @@
+// rg 8/23/2024: I fixed some minor undefined behavior in this module (signed 32-bit left shifts).
+
+#ifndef TINYEXR_H_
+#define TINYEXR_H_
+/*
+Copyright (c) 2014 - 2021, Syoyo Fujita and many contributors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Syoyo Fujita nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+// TinyEXR contains some OpenEXR code, which is licensed under ------------
+
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
+// Digital Ltd. LLC
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// *       Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// *       Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// *       Neither the name of Industrial Light & Magic nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////
+
+// End of OpenEXR license -------------------------------------------------
+
+
+//
+//
+//   Do this:
+//    #define TINYEXR_IMPLEMENTATION
+//   before you include this file in *one* C or C++ file to create the
+//   implementation.
+//
+//   // i.e. it should look like this:
+//   #include ...
+//   #include ...
+//   #include ...
+//   #define TINYEXR_IMPLEMENTATION
+//   #include "tinyexr.h"
+//
+//
+
+#include <stddef.h>  // for size_t
+#include <stdint.h>  // guess stdint.h is available(C99)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+    defined(__i386) || defined(__i486__) || defined(__i486) ||  \
+    defined(i386) || defined(__ia64__) || defined(__x86_64__)
+#define TINYEXR_X86_OR_X64_CPU 1
+#else
+#define TINYEXR_X86_OR_X64_CPU 0
+#endif
+
+#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || TINYEXR_X86_OR_X64_CPU
+#define TINYEXR_LITTLE_ENDIAN 1
+#else
+#define TINYEXR_LITTLE_ENDIAN 0
+#endif
+
+// Use miniz or not to decode ZIP format pixel. Linking with zlib
+// required if this flag is 0 and TINYEXR_USE_STB_ZLIB is 0.
+#ifndef TINYEXR_USE_MINIZ
+#define TINYEXR_USE_MINIZ (1)
+#ifndef MINIZ_HEADER_FILE_ONLY
+#define MINIZ_HEADER_FILE_ONLY (1)
+#endif
+#endif
+
+// Use the ZIP implementation of stb_image.h and stb_image_write.h.
+#ifndef TINYEXR_USE_STB_ZLIB
+#define TINYEXR_USE_STB_ZLIB (0)
+#endif
+
+// Use nanozlib.
+#ifndef TINYEXR_USE_NANOZLIB
+#define TINYEXR_USE_NANOZLIB (0)
+#endif
+
+// Disable PIZ compression when applying cpplint.
+#ifndef TINYEXR_USE_PIZ
+#define TINYEXR_USE_PIZ (1)
+#endif
+
+#ifndef TINYEXR_USE_ZFP
+#define TINYEXR_USE_ZFP (0)  // TinyEXR extension.
+// http://computation.llnl.gov/projects/floating-point-compression
+#endif
+
+#ifndef TINYEXR_USE_THREAD
+#define TINYEXR_USE_THREAD (0)  // No threaded loading.
+// http://computation.llnl.gov/projects/floating-point-compression
+#endif
+
+#ifndef TINYEXR_USE_OPENMP
+#ifdef _OPENMP
+#define TINYEXR_USE_OPENMP (1)
+#else
+#define TINYEXR_USE_OPENMP (0)
+#endif
+#endif
+
+#define TINYEXR_SUCCESS (0)
+#define TINYEXR_ERROR_INVALID_MAGIC_NUMBER (-1)
+#define TINYEXR_ERROR_INVALID_EXR_VERSION (-2)
+#define TINYEXR_ERROR_INVALID_ARGUMENT (-3)
+#define TINYEXR_ERROR_INVALID_DATA (-4)
+#define TINYEXR_ERROR_INVALID_FILE (-5)
+#define TINYEXR_ERROR_INVALID_PARAMETER (-6)
+#define TINYEXR_ERROR_CANT_OPEN_FILE (-7)
+#define TINYEXR_ERROR_UNSUPPORTED_FORMAT (-8)
+#define TINYEXR_ERROR_INVALID_HEADER (-9)
+#define TINYEXR_ERROR_UNSUPPORTED_FEATURE (-10)
+#define TINYEXR_ERROR_CANT_WRITE_FILE (-11)
+#define TINYEXR_ERROR_SERIALIZATION_FAILED (-12)
+#define TINYEXR_ERROR_LAYER_NOT_FOUND (-13)
+#define TINYEXR_ERROR_DATA_TOO_LARGE (-14)
+
+// @note { OpenEXR file format: http://www.openexr.com/openexrfilelayout.pdf }
+
+// pixel type: possible values are: UINT = 0 HALF = 1 FLOAT = 2
+#define TINYEXR_PIXELTYPE_UINT (0)
+#define TINYEXR_PIXELTYPE_HALF (1)
+#define TINYEXR_PIXELTYPE_FLOAT (2)
+
+#define TINYEXR_MAX_HEADER_ATTRIBUTES (1024)
+#define TINYEXR_MAX_CUSTOM_ATTRIBUTES (128)
+
+#define TINYEXR_COMPRESSIONTYPE_NONE (0)
+#define TINYEXR_COMPRESSIONTYPE_RLE (1)
+#define TINYEXR_COMPRESSIONTYPE_ZIPS (2)
+#define TINYEXR_COMPRESSIONTYPE_ZIP (3)
+#define TINYEXR_COMPRESSIONTYPE_PIZ (4)
+#define TINYEXR_COMPRESSIONTYPE_ZFP (128)  // TinyEXR extension
+
+#define TINYEXR_ZFP_COMPRESSIONTYPE_RATE (0)
+#define TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION (1)
+#define TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY (2)
+
+#define TINYEXR_TILE_ONE_LEVEL (0)
+#define TINYEXR_TILE_MIPMAP_LEVELS (1)
+#define TINYEXR_TILE_RIPMAP_LEVELS (2)
+
+#define TINYEXR_TILE_ROUND_DOWN (0)
+#define TINYEXR_TILE_ROUND_UP (1)
+
+typedef struct TEXRVersion {
+  int version;    // this must be 2
+  // tile format image;
+  // not zero for only a single-part "normal" tiled file (according to spec.)
+  int tiled;
+  int long_name;  // long name attribute
+  // deep image(EXR 2.0);
+  // for a multi-part file, indicates that at least one part is of type deep* (according to spec.)
+  int non_image;
+  int multipart;  // multi-part(EXR 2.0)
+} EXRVersion;
+
+typedef struct TEXRAttribute {
+  char name[256];  // name and type are up to 255 chars long.
+  char type[256];
+  unsigned char *value;  // uint8_t*
+  int size;
+  int pad0;
+} EXRAttribute;
+
+typedef struct TEXRChannelInfo {
+  char name[256];  // less than 255 bytes long
+  int pixel_type;
+  int x_sampling;
+  int y_sampling;
+  unsigned char p_linear;
+  unsigned char pad[3];
+} EXRChannelInfo;
+
+typedef struct TEXRTile {
+  int offset_x;
+  int offset_y;
+  int level_x;
+  int level_y;
+
+  int width;   // actual width in a tile.
+  int height;  // actual height int a tile.
+
+  unsigned char **images;  // image[channels][pixels]
+} EXRTile;
+
+typedef struct TEXRBox2i {
+  int min_x;
+  int min_y;
+  int max_x;
+  int max_y;
+} EXRBox2i;
+
+typedef struct TEXRHeader {
+  float pixel_aspect_ratio;
+  int line_order;
+  EXRBox2i data_window;
+  EXRBox2i display_window;
+  float screen_window_center[2];
+  float screen_window_width;
+
+  int chunk_count;
+
+  // Properties for tiled format(`tiledesc`).
+  int tiled;
+  int tile_size_x;
+  int tile_size_y;
+  int tile_level_mode;
+  int tile_rounding_mode;
+
+  int long_name;
+  // for a single-part file, agree with the version field bit 11
+  // for a multi-part file, it is consistent with the type of part
+  int non_image;
+  int multipart;
+  unsigned int header_len;
+
+  // Custom attributes(exludes required attributes(e.g. `channels`,
+  // `compression`, etc)
+  int num_custom_attributes;
+  EXRAttribute *custom_attributes;  // array of EXRAttribute. size =
+                                    // `num_custom_attributes`.
+
+  EXRChannelInfo *channels;  // [num_channels]
+
+  int *pixel_types;  // Loaded pixel type(TINYEXR_PIXELTYPE_*) of `images` for
+  // each channel. This is overwritten with `requested_pixel_types` when
+  // loading.
+  int num_channels;
+
+  int compression_type;        // compression type(TINYEXR_COMPRESSIONTYPE_*)
+  int *requested_pixel_types;  // Filled initially by
+                               // ParseEXRHeaderFrom(Meomory|File), then users
+                               // can edit it(only valid for HALF pixel type
+                               // channel)
+  // name attribute required for multipart files;
+  // must be unique and non empty (according to spec.);
+  // use EXRSetNameAttr for setting value;
+  // max 255 character allowed - excluding terminating zero
+  char name[256];
+} EXRHeader;
+
+typedef struct TEXRMultiPartHeader {
+  int num_headers;
+  EXRHeader *headers;
+
+} EXRMultiPartHeader;
+
+typedef struct TEXRImage {
+  EXRTile *tiles;  // Tiled pixel data. The application must reconstruct image
+                   // from tiles manually. NULL if scanline format.
+  struct TEXRImage* next_level; // NULL if scanline format or image is the last level.
+  int level_x; // x level index
+  int level_y; // y level index
+
+  unsigned char **images;  // image[channels][pixels]. NULL if tiled format.
+
+  int width;
+  int height;
+  int num_channels;
+
+  // Properties for tile format.
+  int num_tiles;
+
+} EXRImage;
+
+typedef struct TEXRMultiPartImage {
+  int num_images;
+  EXRImage *images;
+
+} EXRMultiPartImage;
+
+typedef struct TDeepImage {
+  const char **channel_names;
+  float ***image;      // image[channels][scanlines][samples]
+  int **offset_table;  // offset_table[scanline][offsets]
+  int num_channels;
+  int width;
+  int height;
+  int pad0;
+} DeepImage;
+
+// @deprecated { For backward compatibility. Not recommended to use. }
+// Loads single-frame OpenEXR image. Assume EXR image contains A(single channel
+// alpha) or RGB(A) channels.
+// Application must free image data as returned by `out_rgba`
+// Result image format is: float x RGBA x width x hight
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadEXR(float **out_rgba, int *width, int *height,
+                   const char *filename, const char **err);
+
+// Loads single-frame OpenEXR image by specifying layer name. Assume EXR image
+// contains A(single channel alpha) or RGB(A) channels. Application must free
+// image data as returned by `out_rgba` Result image format is: float x RGBA x
+// width x hight Returns negative value and may set error string in `err` when
+// there's an error When the specified layer name is not found in the EXR file,
+// the function will return `TINYEXR_ERROR_LAYER_NOT_FOUND`.
+extern int LoadEXRWithLayer(float **out_rgba, int *width, int *height,
+                            const char *filename, const char *layer_name,
+                            const char **err, int *num_chans = NULL);
+
+//
+// Get layer infos from EXR file.
+//
+// @param[out] layer_names List of layer names. Application must free memory
+// after using this.
+// @param[out] num_layers The number of layers
+// @param[out] err Error string(will be filled when the function returns error
+// code). Free it using FreeEXRErrorMessage after using this value.
+//
+// @return TINYEXR_SUCCEES upon success.
+//
+extern int EXRLayers(const char *filename, const char **layer_names[],
+                     int *num_layers, const char **err);
+
+// @deprecated
+// Simple wrapper API for ParseEXRHeaderFromFile.
+// checking given file is a EXR file(by just look up header)
+// @return TINYEXR_SUCCEES for EXR image, TINYEXR_ERROR_INVALID_HEADER for
+// others
+extern int IsEXR(const char *filename);
+
+// Simple wrapper API for ParseEXRHeaderFromMemory.
+// Check if given data is a EXR image(by just looking up a header section)
+// @return TINYEXR_SUCCEES for EXR image, TINYEXR_ERROR_INVALID_HEADER for
+// others
+extern int IsEXRFromMemory(const unsigned char *memory, size_t size);
+
+// @deprecated
+// Saves single-frame OpenEXR image to a buffer. Assume EXR image contains RGB(A) channels.
+// components must be 1(Grayscale), 3(RGB) or 4(RGBA).
+// Input image format is: `float x width x height`, or `float x RGB(A) x width x
+// hight`
+// Save image as fp16(HALF) format when `save_as_fp16` is positive non-zero
+// value.
+// Save image as fp32(FLOAT) format when `save_as_fp16` is 0.
+// Use ZIP compression by default.
+// `buffer` is the pointer to write EXR data.
+// Memory for `buffer` is allocated internally in SaveEXRToMemory.
+// Returns the data size of EXR file when the value is positive(up to 2GB EXR data).
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int SaveEXRToMemory(const float *data, const int width, const int height,
+                   const int components, const int save_as_fp16,
+                   const unsigned char **buffer, const char **err);
+
+// @deprecated { Not recommended, but handy to use. }
+// Saves single-frame OpenEXR image to a buffer. Assume EXR image contains RGB(A) channels.
+// components must be 1(Grayscale), 3(RGB) or 4(RGBA).
+// Input image format is: `float x width x height`, or `float x RGB(A) x width x
+// hight`
+// Save image as fp16(HALF) format when `save_as_fp16` is positive non-zero
+// value.
+// Save image as fp32(FLOAT) format when `save_as_fp16` is 0.
+// Use ZIP compression by default.
+// Returns TINYEXR_SUCCEES(0) when success.
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int SaveEXR(const float *data, const int width, const int height,
+                   const int components, const int save_as_fp16,
+                   const char *filename, const char **err);
+
+// Returns the number of resolution levels of the image (including the base)
+extern int EXRNumLevels(const EXRImage* exr_image);
+
+// Initialize EXRHeader struct
+extern void InitEXRHeader(EXRHeader *exr_header);
+
+// Set name attribute of EXRHeader struct (it makes a copy)
+extern void EXRSetNameAttr(EXRHeader *exr_header, const char* name);
+
+// Initialize EXRImage struct
+extern void InitEXRImage(EXRImage *exr_image);
+
+// Frees internal data of EXRHeader struct
+extern int FreeEXRHeader(EXRHeader *exr_header);
+
+// Frees internal data of EXRImage struct
+extern int FreeEXRImage(EXRImage *exr_image);
+
+// Frees error message
+extern void FreeEXRErrorMessage(const char *msg);
+
+// Parse EXR version header of a file.
+extern int ParseEXRVersionFromFile(EXRVersion *version, const char *filename);
+
+// Parse EXR version header from memory-mapped EXR data.
+extern int ParseEXRVersionFromMemory(EXRVersion *version,
+                                     const unsigned char *memory, size_t size);
+
+// Parse single-part OpenEXR header from a file and initialize `EXRHeader`.
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int ParseEXRHeaderFromFile(EXRHeader *header, const EXRVersion *version,
+                                  const char *filename, const char **err);
+
+// Parse single-part OpenEXR header from a memory and initialize `EXRHeader`.
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int ParseEXRHeaderFromMemory(EXRHeader *header,
+                                    const EXRVersion *version,
+                                    const unsigned char *memory, size_t size,
+                                    const char **err);
+
+// Parse multi-part OpenEXR headers from a file and initialize `EXRHeader*`
+// array.
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int ParseEXRMultipartHeaderFromFile(EXRHeader ***headers,
+                                           int *num_headers,
+                                           const EXRVersion *version,
+                                           const char *filename,
+                                           const char **err);
+
+// Parse multi-part OpenEXR headers from a memory and initialize `EXRHeader*`
+// array
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int ParseEXRMultipartHeaderFromMemory(EXRHeader ***headers,
+                                             int *num_headers,
+                                             const EXRVersion *version,
+                                             const unsigned char *memory,
+                                             size_t size, const char **err);
+
+// Loads single-part OpenEXR image from a file.
+// Application must setup `ParseEXRHeaderFromFile` before calling this function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int LoadEXRImageFromFile(EXRImage *image, const EXRHeader *header,
+                                const char *filename, const char **err);
+
+// Loads single-part OpenEXR image from a memory.
+// Application must setup `EXRHeader` with
+// `ParseEXRHeaderFromMemory` before calling this function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int LoadEXRImageFromMemory(EXRImage *image, const EXRHeader *header,
+                                  const unsigned char *memory,
+                                  const size_t size, const char **err);
+
+// Loads multi-part OpenEXR image from a file.
+// Application must setup `ParseEXRMultipartHeaderFromFile` before calling this
+// function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int LoadEXRMultipartImageFromFile(EXRImage *images,
+                                         const EXRHeader **headers,
+                                         unsigned int num_parts,
+                                         const char *filename,
+                                         const char **err);
+
+// Loads multi-part OpenEXR image from a memory.
+// Application must setup `EXRHeader*` array with
+// `ParseEXRMultipartHeaderFromMemory` before calling this function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int LoadEXRMultipartImageFromMemory(EXRImage *images,
+                                           const EXRHeader **headers,
+                                           unsigned int num_parts,
+                                           const unsigned char *memory,
+                                           const size_t size, const char **err);
+
+// Saves multi-channel, single-frame OpenEXR image to a file.
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int SaveEXRImageToFile(const EXRImage *image,
+                              const EXRHeader *exr_header, const char *filename,
+                              const char **err);
+
+// Saves multi-channel, single-frame OpenEXR image to a memory.
+// Image is compressed using EXRImage.compression value.
+// Return the number of bytes if success.
+// Return zero and will set error string in `err` when there's an
+// error.
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern size_t SaveEXRImageToMemory(const EXRImage *image,
+                                   const EXRHeader *exr_header,
+                                   unsigned char **memory, const char **err);
+
+// Saves multi-channel, multi-frame OpenEXR image to a memory.
+// Image is compressed using EXRImage.compression value.
+// File global attributes (eg. display_window) must be set in the first header.
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int SaveEXRMultipartImageToFile(const EXRImage *images,
+                                       const EXRHeader **exr_headers,
+                                       unsigned int num_parts,
+                                       const char *filename, const char **err);
+
+// Saves multi-channel, multi-frame OpenEXR image to a memory.
+// Image is compressed using EXRImage.compression value.
+// File global attributes (eg. display_window) must be set in the first header.
+// Return the number of bytes if success.
+// Return zero and will set error string in `err` when there's an
+// error.
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern size_t SaveEXRMultipartImageToMemory(const EXRImage *images,
+                                            const EXRHeader **exr_headers,
+                                            unsigned int num_parts,
+                                            unsigned char **memory, const char **err);
+// Loads single-frame OpenEXR deep image.
+// Application must free memory of variables in DeepImage(image, offset_table)
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int LoadDeepEXR(DeepImage *out_image, const char *filename,
+                       const char **err);
+
+// NOT YET IMPLEMENTED:
+// Saves single-frame OpenEXR deep image.
+// Returns negative value and may set error string in `err` when there's an
+// error
+// extern int SaveDeepEXR(const DeepImage *in_image, const char *filename,
+//                       const char **err);
+
+// NOT YET IMPLEMENTED:
+// Loads multi-part OpenEXR deep image.
+// Application must free memory of variables in DeepImage(image, offset_table)
+// extern int LoadMultiPartDeepEXR(DeepImage **out_image, int num_parts, const
+// char *filename,
+//                       const char **err);
+
+// For emscripten.
+// Loads single-frame OpenEXR image from memory. Assume EXR image contains
+// RGB(A) channels.
+// Returns negative value and may set error string in `err` when there's an
+// error
+// When there was an error message, Application must free `err` with
+// FreeEXRErrorMessage()
+extern int LoadEXRFromMemory(float **out_rgba, int *width, int *height,
+                             const unsigned char *memory, size_t size,
+                             const char **err);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TINYEXR_H_
+
+#ifdef TINYEXR_IMPLEMENTATION
+#ifndef TINYEXR_IMPLEMENTATION_DEFINED
+#define TINYEXR_IMPLEMENTATION_DEFINED
+
+#ifdef _WIN32
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>  // for UTF-8 and memory-mapping
+
+#if !defined(WINAPI_FAMILY) || (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP)
+#define TINYEXR_USE_WIN32_MMAP (1)
+#endif
+
+#elif defined(__linux__) || defined(__unix__)
+#include <fcntl.h>     // for open()
+#include <sys/mman.h>  // for memory-mapping
+#include <sys/stat.h>  // for stat
+#include <unistd.h>    // for close()
+#define TINYEXR_USE_POSIX_MMAP (1)
+#endif
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sstream>
+
+//#include <iostream> // debug
+
+#include <limits>
+#include <string>
+#include <vector>
+#include <set>
+
+// https://stackoverflow.com/questions/5047971/how-do-i-check-for-c11-support
+#if __cplusplus > 199711L || (defined(_MSC_VER) && _MSC_VER >= 1900)
+#define TINYEXR_HAS_CXX11 (1)
+// C++11
+#include <cstdint>
+
+#if TINYEXR_USE_THREAD
+#include <atomic>
+#include <thread>
+#endif
+
+#else  // __cplusplus > 199711L
+#define TINYEXR_HAS_CXX11 (0)
+#endif  // __cplusplus > 199711L
+
+#if TINYEXR_USE_OPENMP
+#include <omp.h>
+#endif
+
+#if defined(TINYEXR_USE_MINIZ) && (TINYEXR_USE_MINIZ==1)
+#include "../basisu_miniz.h"
+#else
+//  Issue #46. Please include your own zlib-compatible API header before
+//  including `tinyexr.h`
+//#include "zlib.h"
+#endif
+
+#if defined(TINYEXR_USE_NANOZLIB) && (TINYEXR_USE_NANOZLIB==1)
+#define NANOZLIB_IMPLEMENTATION
+#include "nanozlib.h"
+#endif
+
+#if TINYEXR_USE_STB_ZLIB
+// Since we don't know where a project has stb_image.h and stb_image_write.h
+// and whether they are in the include path, we don't include them here, and
+// instead declare the two relevant functions manually.
+// from stb_image.h:
+extern "C" int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+// from stb_image_write.h:
+extern "C" unsigned char *stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality);
+#endif
+
+
+#if TINYEXR_USE_ZFP
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+#endif
+
+#include "zfp.h"
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#endif
+
+// cond: conditional expression
+// msg: std::string
+// err: std::string*
+#define TINYEXR_CHECK_AND_RETURN_MSG(cond, msg, err) do { \
+  if (!(cond)) { \
+    if (!err) { \
+      std::ostringstream ss_e; \
+      ss_e << __func__ << "():" << __LINE__ << msg << "\n"; \
+      (*err) += ss_e.str(); \
+    } \
+    return false;\
+  } \
+  } while(0)
+
+// no error message.
+#define TINYEXR_CHECK_AND_RETURN_C(cond, retcode) do { \
+  if (!(cond)) { \
+    return retcode; \
+  } \
+  } while(0)
+
+namespace tinyexr {
+
+#if __cplusplus > 199711L
+// C++11
+typedef uint64_t tinyexr_uint64;
+typedef int64_t tinyexr_int64;
+#else
+// Although `long long` is not a standard type pre C++11, assume it is defined
+// as a compiler's extension.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-long-long"
+#endif
+typedef unsigned long long tinyexr_uint64;
+typedef long long tinyexr_int64;
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#endif
+
+// static bool IsBigEndian(void) {
+//  union {
+//    unsigned int i;
+//    char c[4];
+//  } bint = {0x01020304};
+//
+//  return bint.c[0] == 1;
+//}
+
+static void SetErrorMessage(const std::string &msg, const char **err) {
+  if (err) {
+#ifdef _WIN32
+    (*err) = _strdup(msg.c_str());
+#else
+    (*err) = strdup(msg.c_str());
+#endif
+  }
+}
+
+#if 0
+static void SetWarningMessage(const std::string &msg, const char **warn) {
+  if (warn) {
+#ifdef _WIN32
+    (*warn) = _strdup(msg.c_str());
+#else
+    (*warn) = strdup(msg.c_str());
+#endif
+  }
+}
+#endif
+
+static const int kEXRVersionSize = 8;
+
+static void cpy2(unsigned short *dst_val, const unsigned short *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+
+  dst[0] = src[0];
+  dst[1] = src[1];
+}
+
+static void swap2(unsigned short *val) {
+#if TINYEXR_LITTLE_ENDIAN
+  (void)val;
+#else
+  unsigned short tmp = *val;
+  unsigned char *dst = reinterpret_cast<unsigned char *>(val);
+  unsigned char *src = reinterpret_cast<unsigned char *>(&tmp);
+
+  dst[0] = src[1];
+  dst[1] = src[0];
+#endif
+}
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-function"
+#endif
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+static void cpy4(int *dst_val, const int *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+}
+
+static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+}
+
+static void cpy4(float *dst_val, const float *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+}
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+static void swap4(unsigned int *val) {
+#if TINYEXR_LITTLE_ENDIAN
+  (void)val;
+#else
+  unsigned int tmp = *val;
+  unsigned char *dst = reinterpret_cast<unsigned char *>(val);
+  unsigned char *src = reinterpret_cast<unsigned char *>(&tmp);
+
+  dst[0] = src[3];
+  dst[1] = src[2];
+  dst[2] = src[1];
+  dst[3] = src[0];
+#endif
+}
+
+static void swap4(int *val) {
+#if TINYEXR_LITTLE_ENDIAN
+  (void)val;
+#else
+  int tmp = *val;
+  unsigned char *dst = reinterpret_cast<unsigned char *>(val);
+  unsigned char *src = reinterpret_cast<unsigned char *>(&tmp);
+
+  dst[0] = src[3];
+  dst[1] = src[2];
+  dst[2] = src[1];
+  dst[3] = src[0];
+#endif
+}
+
+static void swap4(float *val) {
+#if TINYEXR_LITTLE_ENDIAN
+  (void)val;
+#else
+  float tmp = *val;
+  unsigned char *dst = reinterpret_cast<unsigned char *>(val);
+  unsigned char *src = reinterpret_cast<unsigned char *>(&tmp);
+
+  dst[0] = src[3];
+  dst[1] = src[2];
+  dst[2] = src[1];
+  dst[3] = src[0];
+#endif
+}
+
+#if 0
+static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+  dst[4] = src[4];
+  dst[5] = src[5];
+  dst[6] = src[6];
+  dst[7] = src[7];
+}
+#endif
+
+static void swap8(tinyexr::tinyexr_uint64 *val) {
+#if TINYEXR_LITTLE_ENDIAN
+  (void)val;
+#else
+  tinyexr::tinyexr_uint64 tmp = (*val);
+  unsigned char *dst = reinterpret_cast<unsigned char *>(val);
+  unsigned char *src = reinterpret_cast<unsigned char *>(&tmp);
+
+  dst[0] = src[7];
+  dst[1] = src[6];
+  dst[2] = src[5];
+  dst[3] = src[4];
+  dst[4] = src[3];
+  dst[5] = src[2];
+  dst[6] = src[1];
+  dst[7] = src[0];
+#endif
+}
+
+// https://gist.github.com/rygorous/2156668
+union FP32 {
+  unsigned int u;
+  float f;
+  struct {
+#if TINYEXR_LITTLE_ENDIAN
+    unsigned int Mantissa : 23;
+    unsigned int Exponent : 8;
+    unsigned int Sign : 1;
+#else
+    unsigned int Sign : 1;
+    unsigned int Exponent : 8;
+    unsigned int Mantissa : 23;
+#endif
+  } s;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+union FP16 {
+  unsigned short u;
+  struct {
+#if TINYEXR_LITTLE_ENDIAN
+    unsigned int Mantissa : 10;
+    unsigned int Exponent : 5;
+    unsigned int Sign : 1;
+#else
+    unsigned int Sign : 1;
+    unsigned int Exponent : 5;
+    unsigned int Mantissa : 10;
+#endif
+  } s;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+static FP32 half_to_float(FP16 h) {
+  static const FP32 magic = {113 << 23};
+  static const unsigned int shifted_exp = 0x7c00
+                                          << 13;  // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.u & 0x7fffU) << 13U;           // exponent/mantissa bits
+  unsigned int exp_ = shifted_exp & o.u;  // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp_ == shifted_exp)    // Inf/NaN?
+    o.u += (128 - 16) << 23;  // extra exp adjust
+  else if (exp_ == 0)         // Zero/Denormal?
+  {
+    o.u += 1 << 23;  // extra exp adjust
+    o.f -= magic.f;  // renormalize
+  }
+
+  o.u |= (h.u & 0x8000U) << 16U;  // sign bit
+  return o;
+}
+
+static FP16 float_to_half_full(FP32 f) {
+  FP16 o = {0};
+
+  // Based on ISPC reference code (with minor modifications)
+  if (f.s.Exponent == 0)  // Signed zero/denormal (which will underflow)
+    o.s.Exponent = 0;
+  else if (f.s.Exponent == 255)  // Inf or NaN (all exponent bits set)
+  {
+    o.s.Exponent = 31;
+    o.s.Mantissa = f.s.Mantissa ? 0x200 : 0;  // NaN->qNaN and Inf->Inf
+  } else                                      // Normalized number
+  {
+    // Exponent unbias the single, then bias the halfp
+    int newexp = f.s.Exponent - 127 + 15;
+    if (newexp >= 31)  // Overflow, return signed infinity
+      o.s.Exponent = 31;
+    else if (newexp <= 0)  // Underflow
+    {
+      if ((14 - newexp) <= 24)  // Mantissa might be non-zero
+      {
+        unsigned int mant = f.s.Mantissa | 0x800000;  // Hidden 1 bit
+        o.s.Mantissa = mant >> (14 - newexp);
+        if ((mant >> (13 - newexp)) & 1)  // Check for rounding
+          o.u++;  // Round, might overflow into exp bit, but this is OK
+      }
+    } else {
+      o.s.Exponent = static_cast<unsigned int>(newexp);
+      o.s.Mantissa = f.s.Mantissa >> 13;
+      if (f.s.Mantissa & 0x1000)  // Check for rounding
+        o.u++;                    // Round, might overflow to inf, this is OK
+    }
+  }
+
+  o.s.Sign = f.s.Sign;
+  return o;
+}
+
+// NOTE: From OpenEXR code
+// #define IMF_INCREASING_Y  0
+// #define IMF_DECREASING_Y  1
+// #define IMF_RAMDOM_Y    2
+//
+// #define IMF_NO_COMPRESSION  0
+// #define IMF_RLE_COMPRESSION 1
+// #define IMF_ZIPS_COMPRESSION  2
+// #define IMF_ZIP_COMPRESSION 3
+// #define IMF_PIZ_COMPRESSION 4
+// #define IMF_PXR24_COMPRESSION 5
+// #define IMF_B44_COMPRESSION 6
+// #define IMF_B44A_COMPRESSION  7
+
+#ifdef __clang__
+#pragma clang diagnostic push
+
+#if __has_warning("-Wzero-as-null-pointer-constant")
+#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
+#endif
+
+#endif
+
+static const char *ReadString(std::string *s, const char *ptr, size_t len) {
+  // Read untile NULL(\0).
+  const char *p = ptr;
+  const char *q = ptr;
+  while ((size_t(q - ptr) < len) && (*q) != 0) {
+    q++;
+  }
+
+  if (size_t(q - ptr) >= len) {
+    (*s).clear();
+    return NULL;
+  }
+
+  (*s) = std::string(p, q);
+
+  return q + 1;  // skip '\0'
+}
+
+static bool ReadAttribute(std::string *name, std::string *type,
+                          std::vector<unsigned char> *data, size_t *marker_size,
+                          const char *marker, size_t size) {
+  size_t name_len = strnlen(marker, size);
+  if (name_len == size) {
+    // String does not have a terminating character.
+    return false;
+  }
+  *name = std::string(marker, name_len);
+
+  marker += name_len + 1;
+  size -= name_len + 1;
+
+  size_t type_len = strnlen(marker, size);
+  if (type_len == size) {
+    return false;
+  }
+  *type = std::string(marker, type_len);
+
+  marker += type_len + 1;
+  size -= type_len + 1;
+
+  if (size < sizeof(uint32_t)) {
+    return false;
+  }
+
+  uint32_t data_len;
+  memcpy(&data_len, marker, sizeof(uint32_t));
+  tinyexr::swap4(reinterpret_cast<unsigned int *>(&data_len));
+
+  if (data_len == 0) {
+    if ((*type).compare("string") == 0) {
+      // Accept empty string attribute.
+
+      marker += sizeof(uint32_t);
+      size -= sizeof(uint32_t);
+
+      *marker_size = name_len + 1 + type_len + 1 + sizeof(uint32_t);
+
+      data->resize(1);
+      (*data)[0] = '\0';
+
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  marker += sizeof(uint32_t);
+  size -= sizeof(uint32_t);
+
+  if (size < data_len) {
+    return false;
+  }
+
+  data->resize(static_cast<size_t>(data_len));
+  memcpy(&data->at(0), marker, static_cast<size_t>(data_len));
+
+  *marker_size = name_len + 1 + type_len + 1 + sizeof(uint32_t) + data_len;
+  return true;
+}
+
+static void WriteAttributeToMemory(std::vector<unsigned char> *out,
+                                   const char *name, const char *type,
+                                   const unsigned char *data, int len) {
+  out->insert(out->end(), name, name + strlen(name) + 1);
+  out->insert(out->end(), type, type + strlen(type) + 1);
+
+  int outLen = len;
+  tinyexr::swap4(&outLen);
+  out->insert(out->end(), reinterpret_cast<unsigned char *>(&outLen),
+              reinterpret_cast<unsigned char *>(&outLen) + sizeof(int));
+  out->insert(out->end(), data, data + len);
+}
+
+typedef struct TChannelInfo {
+  std::string name;  // less than 255 bytes long
+  int pixel_type;
+  int requested_pixel_type;
+  int x_sampling;
+  int y_sampling;
+  unsigned char p_linear;
+  unsigned char pad[3];
+} ChannelInfo;
+
+typedef struct {
+  int min_x;
+  int min_y;
+  int max_x;
+  int max_y;
+} Box2iInfo;
+
+struct HeaderInfo {
+  std::vector<tinyexr::ChannelInfo> channels;
+  std::vector<EXRAttribute> attributes;
+
+  Box2iInfo data_window;
+  int line_order;
+  Box2iInfo display_window;
+  float screen_window_center[2];
+  float screen_window_width;
+  float pixel_aspect_ratio;
+
+  int chunk_count;
+
+  // Tiled format
+  int tiled; // Non-zero if the part is tiled.
+  int tile_size_x;
+  int tile_size_y;
+  int tile_level_mode;
+  int tile_rounding_mode;
+
+  unsigned int header_len;
+
+  int compression_type;
+
+  // required for multi-part or non-image files
+  std::string name;
+  // required for multi-part or non-image files
+  std::string type;
+
+  void clear() {
+    channels.clear();
+    attributes.clear();
+
+    data_window.min_x = 0;
+    data_window.min_y = 0;
+    data_window.max_x = 0;
+    data_window.max_y = 0;
+    line_order = 0;
+    display_window.min_x = 0;
+    display_window.min_y = 0;
+    display_window.max_x = 0;
+    display_window.max_y = 0;
+    screen_window_center[0] = 0.0f;
+    screen_window_center[1] = 0.0f;
+    screen_window_width = 0.0f;
+    pixel_aspect_ratio = 0.0f;
+
+    chunk_count = 0;
+
+    // Tiled format
+    tiled = 0;
+    tile_size_x = 0;
+    tile_size_y = 0;
+    tile_level_mode = 0;
+    tile_rounding_mode = 0;
+
+    header_len = 0;
+    compression_type = 0;
+
+    name.clear();
+    type.clear();
+  }
+};
+
+static bool ReadChannelInfo(std::vector<ChannelInfo> &channels,
+                            const std::vector<unsigned char> &data) {
+  const char *p = reinterpret_cast<const char *>(&data.at(0));
+
+  for (;;) {
+    if ((*p) == 0) {
+      break;
+    }
+    ChannelInfo info;
+    info.requested_pixel_type = 0;
+
+    tinyexr_int64 data_len = static_cast<tinyexr_int64>(data.size()) -
+                             (p - reinterpret_cast<const char *>(data.data()));
+    if (data_len < 0) {
+      return false;
+    }
+
+    p = ReadString(&info.name, p, size_t(data_len));
+    if ((p == NULL) && (info.name.empty())) {
+      // Buffer overrun. Issue #51.
+      return false;
+    }
+
+    const unsigned char *data_end =
+        reinterpret_cast<const unsigned char *>(p) + 16;
+    if (data_end >= (data.data() + data.size())) {
+      return false;
+    }
+
+    memcpy(&info.pixel_type, p, sizeof(int));
+    p += 4;
+    info.p_linear = static_cast<unsigned char>(p[0]);  // uchar
+    p += 1 + 3;                                        // reserved: uchar[3]
+    memcpy(&info.x_sampling, p, sizeof(int));          // int
+    p += 4;
+    memcpy(&info.y_sampling, p, sizeof(int));  // int
+    p += 4;
+
+    tinyexr::swap4(&info.pixel_type);
+    tinyexr::swap4(&info.x_sampling);
+    tinyexr::swap4(&info.y_sampling);
+
+    channels.push_back(info);
+  }
+
+  return true;
+}
+
+static void WriteChannelInfo(std::vector<unsigned char> &data,
+                             const std::vector<ChannelInfo> &channels) {
+  size_t sz = 0;
+
+  // Calculate total size.
+  for (size_t c = 0; c < channels.size(); c++) {
+    sz += channels[c].name.length() + 1;  // +1 for \0
+    sz += 16;                                    // 4 * int
+  }
+  data.resize(sz + 1);
+
+  unsigned char *p = &data.at(0);
+
+  for (size_t c = 0; c < channels.size(); c++) {
+    memcpy(p, channels[c].name.c_str(), channels[c].name.length());
+    p += channels[c].name.length();
+    (*p) = '\0';
+    p++;
+
+    int pixel_type = channels[c].requested_pixel_type;
+    int x_sampling = channels[c].x_sampling;
+    int y_sampling = channels[c].y_sampling;
+    tinyexr::swap4(&pixel_type);
+    tinyexr::swap4(&x_sampling);
+    tinyexr::swap4(&y_sampling);
+
+    memcpy(p, &pixel_type, sizeof(int));
+    p += sizeof(int);
+
+    (*p) = channels[c].p_linear;
+    p += 4;
+
+    memcpy(p, &x_sampling, sizeof(int));
+    p += sizeof(int);
+
+    memcpy(p, &y_sampling, sizeof(int));
+    p += sizeof(int);
+  }
+
+  (*p) = '\0';
+}
+
+static bool CompressZip(unsigned char *dst,
+                        tinyexr::tinyexr_uint64 &compressedSize,
+                        const unsigned char *src, unsigned long src_size) {
+  std::vector<unsigned char> tmpBuf(src_size);
+
+  //
+  // Apply EXR-specific? postprocess. Grabbed from OpenEXR's
+  // ImfZipCompressor.cpp
+  //
+
+  //
+  // Reorder the pixel data.
+  //
+
+  const char *srcPtr = reinterpret_cast<const char *>(src);
+
+  {
+    char *t1 = reinterpret_cast<char *>(&tmpBuf.at(0));
+    char *t2 = reinterpret_cast<char *>(&tmpBuf.at(0)) + (src_size + 1) / 2;
+    const char *stop = srcPtr + src_size;
+
+    for (;;) {
+      if (srcPtr < stop)
+        *(t1++) = *(srcPtr++);
+      else
+        break;
+
+      if (srcPtr < stop)
+        *(t2++) = *(srcPtr++);
+      else
+        break;
+    }
+  }
+
+  //
+  // Predictor.
+  //
+
+  {
+    unsigned char *t = &tmpBuf.at(0) + 1;
+    unsigned char *stop = &tmpBuf.at(0) + src_size;
+    int p = t[-1];
+
+    while (t < stop) {
+      int d = int(t[0]) - p + (128 + 256);
+      p = t[0];
+      t[0] = static_cast<unsigned char>(d);
+      ++t;
+    }
+  }
+
+#if defined(TINYEXR_USE_MINIZ) && (TINYEXR_USE_MINIZ==1)
+  //
+  // Compress the data using miniz
+  //
+
+  buminiz::mz_ulong outSize = buminiz::mz_compressBound(src_size);
+  int ret = buminiz::mz_compress(
+      dst, &outSize, static_cast<const unsigned char *>(&tmpBuf.at(0)),
+      src_size);
+  if (ret != buminiz::MZ_OK) {
+    return false;
+  }
+
+  compressedSize = outSize;
+#elif defined(TINYEXR_USE_STB_ZLIB) && (TINYEXR_USE_STB_ZLIB==1)
+  int outSize;
+  unsigned char* ret = stbi_zlib_compress(const_cast<unsigned char*>(&tmpBuf.at(0)), src_size, &outSize, 8);
+  if (!ret) {
+    return false;
+  }
+  memcpy(dst, ret, outSize);
+  free(ret);
+
+  compressedSize = outSize;
+#elif defined(TINYEXR_USE_NANOZLIB) && (TINYEXR_USE_NANOZLIB==1)
+  uint64_t dstSize = nanoz_compressBound(static_cast<uint64_t>(src_size));
+  int outSize{0};
+  unsigned char *ret = nanoz_compress(&tmpBuf.at(0), src_size, &outSize, /* quality */8);
+  if (!ret) {
+    return false;
+  }
+
+  memcpy(dst, ret, outSize);
+  free(ret);
+  
+  compressedSize = outSize;
+#else
+  uLong outSize = compressBound(static_cast<uLong>(src_size));
+  int ret = compress(dst, &outSize, static_cast<const Bytef *>(&tmpBuf.at(0)),
+                     src_size);
+  if (ret != Z_OK) {
+    return false;
+  }
+
+  compressedSize = outSize;
+#endif
+
+  // Use uncompressed data when compressed data is larger than uncompressed.
+  // (Issue 40)
+  if (compressedSize >= src_size) {
+    compressedSize = src_size;
+    memcpy(dst, src, src_size);
+  }
+
+  return true;
+}
+
+static bool DecompressZip(unsigned char *dst,
+                          unsigned long *uncompressed_size /* inout */,
+                          const unsigned char *src, unsigned long src_size) {
+  if ((*uncompressed_size) == src_size) {
+    // Data is not compressed(Issue 40).
+    memcpy(dst, src, src_size);
+    return true;
+  }
+  std::vector<unsigned char> tmpBuf(*uncompressed_size);
+
+#if defined(TINYEXR_USE_MINIZ) && (TINYEXR_USE_MINIZ==1)
+  int ret =
+      buminiz::mz_uncompress(&tmpBuf.at(0), uncompressed_size, src, src_size);
+  if (buminiz::MZ_OK != ret) {
+    return false;
+  }
+#elif TINYEXR_USE_STB_ZLIB
+  int ret = stbi_zlib_decode_buffer(reinterpret_cast<char*>(&tmpBuf.at(0)),
+      *uncompressed_size, reinterpret_cast<const char*>(src), src_size);
+  if (ret < 0) {
+    return false;
+  }
+#elif defined(TINYEXR_USE_NANOZLIB) && (TINYEXR_USE_NANOZLIB==1)
+  uint64_t dest_size = (*uncompressed_size);
+  uint64_t uncomp_size{0};
+  nanoz_status_t ret =
+      nanoz_uncompress(src, src_size, dest_size, &tmpBuf.at(0), &uncomp_size);
+  if (NANOZ_SUCCESS != ret) {
+    return false;
+  }
+  if ((*uncompressed_size) != uncomp_size) {
+    return false;
+  }
+#else
+  int ret = uncompress(&tmpBuf.at(0), uncompressed_size, src, src_size);
+  if (Z_OK != ret) {
+    return false;
+  }
+#endif
+
+  //
+  // Apply EXR-specific? postprocess. Grabbed from OpenEXR's
+  // ImfZipCompressor.cpp
+  //
+
+  // Predictor.
+  {
+    unsigned char *t = &tmpBuf.at(0) + 1;
+    unsigned char *stop = &tmpBuf.at(0) + (*uncompressed_size);
+
+    while (t < stop) {
+      int d = int(t[-1]) + int(t[0]) - 128;
+      t[0] = static_cast<unsigned char>(d);
+      ++t;
+    }
+  }
+
+  // Reorder the pixel data.
+  {
+    const char *t1 = reinterpret_cast<const char *>(&tmpBuf.at(0));
+    const char *t2 = reinterpret_cast<const char *>(&tmpBuf.at(0)) +
+                     (*uncompressed_size + 1) / 2;
+    char *s = reinterpret_cast<char *>(dst);
+    char *stop = s + (*uncompressed_size);
+
+    for (;;) {
+      if (s < stop)
+        *(s++) = *(t1++);
+      else
+        break;
+
+      if (s < stop)
+        *(s++) = *(t2++);
+      else
+        break;
+    }
+  }
+
+  return true;
+}
+
+// RLE code from OpenEXR --------------------------------------
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#if __has_warning("-Wextra-semi-stmt")
+#pragma clang diagnostic ignored "-Wextra-semi-stmt"
+#endif
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4204)  // nonstandard extension used : non-constant
+                                 // aggregate initializer (also supported by GNU
+                                 // C and C99, so no big deal)
+#pragma warning(disable : 4244)  // 'initializing': conversion from '__int64' to
+                                 // 'int', possible loss of data
+#pragma warning(disable : 4267)  // 'argument': conversion from '__int64' to
+                                 // 'int', possible loss of data
+#pragma warning(disable : 4996)  // 'strdup': The POSIX name for this item is
+                                 // deprecated. Instead, use the ISO C and C++
+                                 // conformant name: _strdup.
+#endif
+
+const int MIN_RUN_LENGTH = 3;
+const int MAX_RUN_LENGTH = 127;
+
+//
+// Compress an array of bytes, using run-length encoding,
+// and return the length of the compressed data.
+//
+
+static int rleCompress(int inLength, const char in[], signed char out[]) {
+  const char *inEnd = in + inLength;
+  const char *runStart = in;
+  const char *runEnd = in + 1;
+  signed char *outWrite = out;
+
+  while (runStart < inEnd) {
+    while (runEnd < inEnd && *runStart == *runEnd &&
+           runEnd - runStart - 1 < MAX_RUN_LENGTH) {
+      ++runEnd;
+    }
+
+    if (runEnd - runStart >= MIN_RUN_LENGTH) {
+      //
+      // Compressible run
+      //
+
+      *outWrite++ = static_cast<char>(runEnd - runStart) - 1;
+      *outWrite++ = *(reinterpret_cast<const signed char *>(runStart));
+      runStart = runEnd;
+    } else {
+      //
+      // Uncompressable run
+      //
+
+      while (runEnd < inEnd &&
+             ((runEnd + 1 >= inEnd || *runEnd != *(runEnd + 1)) ||
+              (runEnd + 2 >= inEnd || *(runEnd + 1) != *(runEnd + 2))) &&
+             runEnd - runStart < MAX_RUN_LENGTH) {
+        ++runEnd;
+      }
+
+      *outWrite++ = static_cast<char>(runStart - runEnd);
+
+      while (runStart < runEnd) {
+        *outWrite++ = *(reinterpret_cast<const signed char *>(runStart++));
+      }
+    }
+
+    ++runEnd;
+  }
+
+  return static_cast<int>(outWrite - out);
+}
+
+//
+// Uncompress an array of bytes compressed with rleCompress().
+// Returns the length of the uncompressed data, or 0 if the
+// length of the uncompressed data would be more than maxLength.
+//
+
+static int rleUncompress(int inLength, int maxLength, const signed char in[],
+                         char out[]) {
+  char *outStart = out;
+
+  while (inLength > 0) {
+    if (*in < 0) {
+      int count = -(static_cast<int>(*in++));
+      inLength -= count + 1;
+
+      // Fixes #116: Add bounds check to in buffer.
+      if ((0 > (maxLength -= count)) || (inLength < 0)) return 0;
+
+      memcpy(out, in, count);
+      out += count;
+      in += count;
+    } else {
+      int count = *in++;
+      inLength -= 2;
+
+      if ((0 > (maxLength -= count + 1)) || (inLength < 0)) return 0;
+
+      memset(out, *reinterpret_cast<const char *>(in), count + 1);
+      out += count + 1;
+
+      in++;
+    }
+  }
+
+  return static_cast<int>(out - outStart);
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// End of RLE code from OpenEXR -----------------------------------
+
+static bool CompressRle(unsigned char *dst,
+                        tinyexr::tinyexr_uint64 &compressedSize,
+                        const unsigned char *src, unsigned long src_size) {
+  std::vector<unsigned char> tmpBuf(src_size);
+
+  //
+  // Apply EXR-specific? postprocess. Grabbed from OpenEXR's
+  // ImfRleCompressor.cpp
+  //
+
+  //
+  // Reorder the pixel data.
+  //
+
+  const char *srcPtr = reinterpret_cast<const char *>(src);
+
+  {
+    char *t1 = reinterpret_cast<char *>(&tmpBuf.at(0));
+    char *t2 = reinterpret_cast<char *>(&tmpBuf.at(0)) + (src_size + 1) / 2;
+    const char *stop = srcPtr + src_size;
+
+    for (;;) {
+      if (srcPtr < stop)
+        *(t1++) = *(srcPtr++);
+      else
+        break;
+
+      if (srcPtr < stop)
+        *(t2++) = *(srcPtr++);
+      else
+        break;
+    }
+  }
+
+  //
+  // Predictor.
+  //
+
+  {
+    unsigned char *t = &tmpBuf.at(0) + 1;
+    unsigned char *stop = &tmpBuf.at(0) + src_size;
+    int p = t[-1];
+
+    while (t < stop) {
+      int d = int(t[0]) - p + (128 + 256);
+      p = t[0];
+      t[0] = static_cast<unsigned char>(d);
+      ++t;
+    }
+  }
+
+  // outSize will be (srcSiz * 3) / 2 at max.
+  int outSize = rleCompress(static_cast<int>(src_size),
+                            reinterpret_cast<const char *>(&tmpBuf.at(0)),
+                            reinterpret_cast<signed char *>(dst));
+  TINYEXR_CHECK_AND_RETURN_C(outSize > 0, false);
+
+  compressedSize = static_cast<tinyexr::tinyexr_uint64>(outSize);
+
+  // Use uncompressed data when compressed data is larger than uncompressed.
+  // (Issue 40)
+  if (compressedSize >= src_size) {
+    compressedSize = src_size;
+    memcpy(dst, src, src_size);
+  }
+
+  return true;
+}
+
+static bool DecompressRle(unsigned char *dst,
+                          const unsigned long uncompressed_size,
+                          const unsigned char *src, unsigned long src_size) {
+  if (uncompressed_size == src_size) {
+    // Data is not compressed(Issue 40).
+    memcpy(dst, src, src_size);
+    return true;
+  }
+
+  // Workaround for issue #112.
+  // TODO(syoyo): Add more robust out-of-bounds check in `rleUncompress`.
+  if (src_size <= 2) {
+    return false;
+  }
+
+  std::vector<unsigned char> tmpBuf(uncompressed_size);
+
+  int ret = rleUncompress(static_cast<int>(src_size),
+                          static_cast<int>(uncompressed_size),
+                          reinterpret_cast<const signed char *>(src),
+                          reinterpret_cast<char *>(&tmpBuf.at(0)));
+  if (ret != static_cast<int>(uncompressed_size)) {
+    return false;
+  }
+
+  //
+  // Apply EXR-specific? postprocess. Grabbed from OpenEXR's
+  // ImfRleCompressor.cpp
+  //
+
+  // Predictor.
+  {
+    unsigned char *t = &tmpBuf.at(0) + 1;
+    unsigned char *stop = &tmpBuf.at(0) + uncompressed_size;
+
+    while (t < stop) {
+      int d = int(t[-1]) + int(t[0]) - 128;
+      t[0] = static_cast<unsigned char>(d);
+      ++t;
+    }
+  }
+
+  // Reorder the pixel data.
+  {
+    const char *t1 = reinterpret_cast<const char *>(&tmpBuf.at(0));
+    const char *t2 = reinterpret_cast<const char *>(&tmpBuf.at(0)) +
+                     (uncompressed_size + 1) / 2;
+    char *s = reinterpret_cast<char *>(dst);
+    char *stop = s + uncompressed_size;
+
+    for (;;) {
+      if (s < stop)
+        *(s++) = *(t1++);
+      else
+        break;
+
+      if (s < stop)
+        *(s++) = *(t2++);
+      else
+        break;
+    }
+  }
+
+  return true;
+}
+
+#if TINYEXR_USE_PIZ
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-long-long"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wpadded"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+
+#if __has_warning("-Wcast-qual")
+#pragma clang diagnostic ignored "-Wcast-qual"
+#endif
+
+#if __has_warning("-Wextra-semi-stmt")
+#pragma clang diagnostic ignored "-Wextra-semi-stmt"
+#endif
+
+#endif
+
+//
+// PIZ compress/uncompress, based on OpenEXR's ImfPizCompressor.cpp
+//
+// -----------------------------------------------------------------
+// Copyright (c) 2004, Industrial Light & Magic, a division of Lucas
+// Digital Ltd. LLC)
+// (3 clause BSD license)
+//
+
+struct PIZChannelData {
+  unsigned short *start;
+  unsigned short *end;
+  int nx;
+  int ny;
+  int ys;
+  int size;
+};
+
+//-----------------------------------------------------------------------------
+//
+//  16-bit Haar Wavelet encoding and decoding
+//
+//  The source code in this file is derived from the encoding
+//  and decoding routines written by Christian Rouet for his
+//  PIZ image file format.
+//
+//-----------------------------------------------------------------------------
+
+//
+// Wavelet basis functions without modulo arithmetic; they produce
+// the best compression ratios when the wavelet-transformed data are
+// Huffman-encoded, but the wavelet transform works only for 14-bit
+// data (untransformed data values must be less than (1 << 14)).
+//
+
+inline void wenc14(unsigned short a, unsigned short b, unsigned short &l,
+                   unsigned short &h) {
+  short as = static_cast<short>(a);
+  short bs = static_cast<short>(b);
+
+  short ms = (as + bs) >> 1;
+  short ds = as - bs;
+
+  l = static_cast<unsigned short>(ms);
+  h = static_cast<unsigned short>(ds);
+}
+
+inline void wdec14(unsigned short l, unsigned short h, unsigned short &a,
+                   unsigned short &b) {
+  short ls = static_cast<short>(l);
+  short hs = static_cast<short>(h);
+
+  int hi = hs;
+  int ai = ls + (hi & 1) + (hi >> 1);
+
+  short as = static_cast<short>(ai);
+  short bs = static_cast<short>(ai - hi);
+
+  a = static_cast<unsigned short>(as);
+  b = static_cast<unsigned short>(bs);
+}
+
+//
+// Wavelet basis functions with modulo arithmetic; they work with full
+// 16-bit data, but Huffman-encoding the wavelet-transformed data doesn't
+// compress the data quite as well.
+//
+
+const int NBITS = 16;
+const int A_OFFSET = 1 << (NBITS - 1);
+const int M_OFFSET = 1 << (NBITS - 1);
+const int MOD_MASK = (1 << NBITS) - 1;
+
+inline void wenc16(unsigned short a, unsigned short b, unsigned short &l,
+                   unsigned short &h) {
+  int ao = (a + A_OFFSET) & MOD_MASK;
+  int m = ((ao + b) >> 1);
+  int d = ao - b;
+
+  if (d < 0) m = (m + M_OFFSET) & MOD_MASK;
+
+  d &= MOD_MASK;
+
+  l = static_cast<unsigned short>(m);
+  h = static_cast<unsigned short>(d);
+}
+
+inline void wdec16(unsigned short l, unsigned short h, unsigned short &a,
+                   unsigned short &b) {
+  int m = l;
+  int d = h;
+  int bb = (m - (d >> 1)) & MOD_MASK;
+  int aa = (d + bb - A_OFFSET) & MOD_MASK;
+  b = static_cast<unsigned short>(bb);
+  a = static_cast<unsigned short>(aa);
+}
+
+//
+// 2D Wavelet encoding:
+//
+
+static void wav2Encode(
+    unsigned short *in,  // io: values are transformed in place
+    int nx,              // i : x size
+    int ox,              // i : x offset
+    int ny,              // i : y size
+    int oy,              // i : y offset
+    unsigned short mx)   // i : maximum in[x][y] value
+{
+  bool w14 = (mx < (1 << 14));
+  int n = (nx > ny) ? ny : nx;
+  int p = 1;   // == 1 <<  level
+  int p2 = 2;  // == 1 << (level+1)
+
+  //
+  // Hierarchical loop on smaller dimension n
+  //
+
+  while (p2 <= n) {
+    unsigned short *py = in;
+    unsigned short *ey = in + oy * (ny - p2);
+    int oy1 = oy * p;
+    int oy2 = oy * p2;
+    int ox1 = ox * p;
+    int ox2 = ox * p2;
+    unsigned short i00, i01, i10, i11;
+
+    //
+    // Y loop
+    //
+
+    for (; py <= ey; py += oy2) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      //
+      // X loop
+      //
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+        unsigned short *p10 = px + oy1;
+        unsigned short *p11 = p10 + ox1;
+
+        //
+        // 2D wavelet encoding
+        //
+
+        if (w14) {
+          wenc14(*px, *p01, i00, i01);
+          wenc14(*p10, *p11, i10, i11);
+          wenc14(i00, i10, *px, *p10);
+          wenc14(i01, i11, *p01, *p11);
+        } else {
+          wenc16(*px, *p01, i00, i01);
+          wenc16(*p10, *p11, i10, i11);
+          wenc16(i00, i10, *px, *p10);
+          wenc16(i01, i11, *p01, *p11);
+        }
+      }
+
+      //
+      // Encode (1D) odd column (still in Y loop)
+      //
+
+      if (nx & p) {
+        unsigned short *p10 = px + oy1;
+
+        if (w14)
+          wenc14(*px, *p10, i00, *p10);
+        else
+          wenc16(*px, *p10, i00, *p10);
+
+        *px = i00;
+      }
+    }
+
+    //
+    // Encode (1D) odd line (must loop in X)
+    //
+
+    if (ny & p) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+
+        if (w14)
+          wenc14(*px, *p01, i00, *p01);
+        else
+          wenc16(*px, *p01, i00, *p01);
+
+        *px = i00;
+      }
+    }
+
+    //
+    // Next level
+    //
+
+    p = p2;
+    p2 <<= 1;
+  }
+}
+
+//
+// 2D Wavelet decoding:
+//
+
+static void wav2Decode(
+    unsigned short *in,  // io: values are transformed in place
+    int nx,              // i : x size
+    int ox,              // i : x offset
+    int ny,              // i : y size
+    int oy,              // i : y offset
+    unsigned short mx)   // i : maximum in[x][y] value
+{
+  bool w14 = (mx < (1 << 14));
+  int n = (nx > ny) ? ny : nx;
+  int p = 1;
+  int p2;
+
+  //
+  // Search max level
+  //
+
+  while (p <= n) p <<= 1;
+
+  p >>= 1;
+  p2 = p;
+  p >>= 1;
+
+  //
+  // Hierarchical loop on smaller dimension n
+  //
+
+  while (p >= 1) {
+    unsigned short *py = in;
+    unsigned short *ey = in + oy * (ny - p2);
+    int oy1 = oy * p;
+    int oy2 = oy * p2;
+    int ox1 = ox * p;
+    int ox2 = ox * p2;
+    unsigned short i00, i01, i10, i11;
+
+    //
+    // Y loop
+    //
+
+    for (; py <= ey; py += oy2) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      //
+      // X loop
+      //
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+        unsigned short *p10 = px + oy1;
+        unsigned short *p11 = p10 + ox1;
+
+        //
+        // 2D wavelet decoding
+        //
+
+        if (w14) {
+          wdec14(*px, *p10, i00, i10);
+          wdec14(*p01, *p11, i01, i11);
+          wdec14(i00, i01, *px, *p01);
+          wdec14(i10, i11, *p10, *p11);
+        } else {
+          wdec16(*px, *p10, i00, i10);
+          wdec16(*p01, *p11, i01, i11);
+          wdec16(i00, i01, *px, *p01);
+          wdec16(i10, i11, *p10, *p11);
+        }
+      }
+
+      //
+      // Decode (1D) odd column (still in Y loop)
+      //
+
+      if (nx & p) {
+        unsigned short *p10 = px + oy1;
+
+        if (w14)
+          wdec14(*px, *p10, i00, *p10);
+        else
+          wdec16(*px, *p10, i00, *p10);
+
+        *px = i00;
+      }
+    }
+
+    //
+    // Decode (1D) odd line (must loop in X)
+    //
+
+    if (ny & p) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+
+        if (w14)
+          wdec14(*px, *p01, i00, *p01);
+        else
+          wdec16(*px, *p01, i00, *p01);
+
+        *px = i00;
+      }
+    }
+
+    //
+    // Next level
+    //
+
+    p2 = p;
+    p >>= 1;
+  }
+}
+
+//-----------------------------------------------------------------------------
+//
+//  16-bit Huffman compression and decompression.
+//
+//  The source code in this file is derived from the 8-bit
+//  Huffman compression and decompression routines written
+//  by Christian Rouet for his PIZ image file format.
+//
+//-----------------------------------------------------------------------------
+
+// Adds some modification for tinyexr.
+
+const int HUF_ENCBITS = 16;  // literal (value) bit length
+const int HUF_DECBITS = 14;  // decoding bit size (>= 8)
+
+const int HUF_ENCSIZE = (1 << HUF_ENCBITS) + 1;  // encoding table size
+const int HUF_DECSIZE = 1 << HUF_DECBITS;        // decoding table size
+const int HUF_DECMASK = HUF_DECSIZE - 1;
+
+struct HufDec {  // short code    long code
+  //-------------------------------
+  unsigned int len : 8;   // code length    0
+  unsigned int lit : 24;  // lit      p size
+  unsigned int *p;        // 0      lits
+};
+
+inline long long hufLength(long long code) { return code & 63; }
+
+inline long long hufCode(long long code) { return code >> 6; }
+
+inline void outputBits(int nBits, long long bits, long long &c, int &lc,
+                       char *&out) {
+  c <<= nBits;
+  lc += nBits;
+
+  c |= bits;
+
+  while (lc >= 8) *out++ = static_cast<char>((c >> (lc -= 8)));
+}
+
+inline long long getBits(int nBits, long long &c, int &lc, const char *&in) {
+  while (lc < nBits) {
+    c = (long long)((unsigned long long)c << 8) | *(reinterpret_cast<const unsigned char *>(in++));
+    lc += 8;
+  }
+
+  lc -= nBits;
+  return (c >> lc) & ((1 << nBits) - 1);
+}
+
+//
+// ENCODING TABLE BUILDING & (UN)PACKING
+//
+
+//
+// Build a "canonical" Huffman code table:
+//  - for each (uncompressed) symbol, hcode contains the length
+//    of the corresponding code (in the compressed data)
+//  - canonical codes are computed and stored in hcode
+//  - the rules for constructing canonical codes are as follows:
+//    * shorter codes (if filled with zeroes to the right)
+//      have a numerically higher value than longer codes
+//    * for codes with the same length, numerical values
+//      increase with numerical symbol values
+//  - because the canonical code table can be constructed from
+//    symbol lengths alone, the code table can be transmitted
+//    without sending the actual code values
+//  - see http://www.compressconsult.com/huffman/
+//
+
+static void hufCanonicalCodeTable(long long hcode[HUF_ENCSIZE]) {
+  long long n[59];
+
+  //
+  // For each i from 0 through 58, count the
+  // number of different codes of length i, and
+  // store the count in n[i].
+  //
+
+  for (int i = 0; i <= 58; ++i) n[i] = 0;
+
+  for (int i = 0; i < HUF_ENCSIZE; ++i) n[hcode[i]] += 1;
+
+  //
+  // For each i from 58 through 1, compute the
+  // numerically lowest code with length i, and
+  // store that code in n[i].
+  //
+
+  long long c = 0;
+
+  for (int i = 58; i > 0; --i) {
+    long long nc = ((c + n[i]) >> 1);
+    n[i] = c;
+    c = nc;
+  }
+
+  //
+  // hcode[i] contains the length, l, of the
+  // code for symbol i.  Assign the next available
+  // code of length l to the symbol and store both
+  // l and the code in hcode[i].
+  //
+
+  for (int i = 0; i < HUF_ENCSIZE; ++i) {
+    int l = static_cast<int>(hcode[i]);
+
+    if (l > 0) hcode[i] = l | (n[l]++ << 6);
+  }
+}
+
+//
+// Compute Huffman codes (based on frq input) and store them in frq:
+//  - code structure is : [63:lsb - 6:msb] | [5-0: bit length];
+//  - max code length is 58 bits;
+//  - codes outside the range [im-iM] have a null length (unused values);
+//  - original frequencies are destroyed;
+//  - encoding tables are used by hufEncode() and hufBuildDecTable();
+//
+
+struct FHeapCompare {
+  bool operator()(long long *a, long long *b) { return *a > *b; }
+};
+
+static bool hufBuildEncTable(
+    long long *frq,  // io: input frequencies [HUF_ENCSIZE], output table
+    int *im,         //  o: min frq index
+    int *iM)         //  o: max frq index
+{
+  //
+  // This function assumes that when it is called, array frq
+  // indicates the frequency of all possible symbols in the data
+  // that are to be Huffman-encoded.  (frq[i] contains the number
+  // of occurrences of symbol i in the data.)
+  //
+  // The loop below does three things:
+  //
+  // 1) Finds the minimum and maximum indices that point
+  //    to non-zero entries in frq:
+  //
+  //     frq[im] != 0, and frq[i] == 0 for all i < im
+  //     frq[iM] != 0, and frq[i] == 0 for all i > iM
+  //
+  // 2) Fills array fHeap with pointers to all non-zero
+  //    entries in frq.
+  //
+  // 3) Initializes array hlink such that hlink[i] == i
+  //    for all array entries.
+  //
+
+  std::vector<int> hlink(HUF_ENCSIZE);
+  std::vector<long long *> fHeap(HUF_ENCSIZE);
+
+  *im = 0;
+
+  while (!frq[*im]) (*im)++;
+
+  int nf = 0;
+
+  for (int i = *im; i < HUF_ENCSIZE; i++) {
+    hlink[i] = i;
+
+    if (frq[i]) {
+      fHeap[nf] = &frq[i];
+      nf++;
+      *iM = i;
+    }
+  }
+
+  //
+  // Add a pseudo-symbol, with a frequency count of 1, to frq;
+  // adjust the fHeap and hlink array accordingly.  Function
+  // hufEncode() uses the pseudo-symbol for run-length encoding.
+  //
+
+  (*iM)++;
+  frq[*iM] = 1;
+  fHeap[nf] = &frq[*iM];
+  nf++;
+
+  //
+  // Build an array, scode, such that scode[i] contains the number
+  // of bits assigned to symbol i.  Conceptually this is done by
+  // constructing a tree whose leaves are the symbols with non-zero
+  // frequency:
+  //
+  //     Make a heap that contains all symbols with a non-zero frequency,
+  //     with the least frequent symbol on top.
+  //
+  //     Repeat until only one symbol is left on the heap:
+  //
+  //         Take the two least frequent symbols off the top of the heap.
+  //         Create a new node that has first two nodes as children, and
+  //         whose frequency is the sum of the frequencies of the first
+  //         two nodes.  Put the new node back into the heap.
+  //
+  // The last node left on the heap is the root of the tree.  For each
+  // leaf node, the distance between the root and the leaf is the length
+  // of the code for the corresponding symbol.
+  //
+  // The loop below doesn't actually build the tree; instead we compute
+  // the distances of the leaves from the root on the fly.  When a new
+  // node is added to the heap, then that node's descendants are linked
+  // into a single linear list that starts at the new node, and the code
+  // lengths of the descendants (that is, their distance from the root
+  // of the tree) are incremented by one.
+  //
+
+  std::make_heap(&fHeap[0], &fHeap[nf], FHeapCompare());
+
+  std::vector<long long> scode(HUF_ENCSIZE);
+  memset(scode.data(), 0, sizeof(long long) * HUF_ENCSIZE);
+
+  while (nf > 1) {
+    //
+    // Find the indices, mm and m, of the two smallest non-zero frq
+    // values in fHeap, add the smallest frq to the second-smallest
+    // frq, and remove the smallest frq value from fHeap.
+    //
+
+    int mm = fHeap[0] - frq;
+    std::pop_heap(&fHeap[0], &fHeap[nf], FHeapCompare());
+    --nf;
+
+    int m = fHeap[0] - frq;
+    std::pop_heap(&fHeap[0], &fHeap[nf], FHeapCompare());
+
+    frq[m] += frq[mm];
+    std::push_heap(&fHeap[0], &fHeap[nf], FHeapCompare());
+
+    //
+    // The entries in scode are linked into lists with the
+    // entries in hlink serving as "next" pointers and with
+    // the end of a list marked by hlink[j] == j.
+    //
+    // Traverse the lists that start at scode[m] and scode[mm].
+    // For each element visited, increment the length of the
+    // corresponding code by one bit. (If we visit scode[j]
+    // during the traversal, then the code for symbol j becomes
+    // one bit longer.)
+    //
+    // Merge the lists that start at scode[m] and scode[mm]
+    // into a single list that starts at scode[m].
+    //
+
+    //
+    // Add a bit to all codes in the first list.
+    //
+
+    for (int j = m;; j = hlink[j]) {
+      scode[j]++;
+
+      TINYEXR_CHECK_AND_RETURN_C(scode[j] <= 58, false);
+
+      if (hlink[j] == j) {
+        //
+        // Merge the two lists.
+        //
+
+        hlink[j] = mm;
+        break;
+      }
+    }
+
+    //
+    // Add a bit to all codes in the second list
+    //
+
+    for (int j = mm;; j = hlink[j]) {
+      scode[j]++;
+
+      TINYEXR_CHECK_AND_RETURN_C(scode[j] <= 58, false);
+
+      if (hlink[j] == j) break;
+    }
+  }
+
+  //
+  // Build a canonical Huffman code table, replacing the code
+  // lengths in scode with (code, code length) pairs.  Copy the
+  // code table from scode into frq.
+  //
+
+  hufCanonicalCodeTable(scode.data());
+  memcpy(frq, scode.data(), sizeof(long long) * HUF_ENCSIZE);
+
+  return true;
+}
+
+//
+// Pack an encoding table:
+//  - only code lengths, not actual codes, are stored
+//  - runs of zeroes are compressed as follows:
+//
+//    unpacked    packed
+//    --------------------------------
+//    1 zero    0  (6 bits)
+//    2 zeroes    59
+//    3 zeroes    60
+//    4 zeroes    61
+//    5 zeroes    62
+//    n zeroes (6 or more)  63 n-6  (6 + 8 bits)
+//
+
+const int SHORT_ZEROCODE_RUN = 59;
+const int LONG_ZEROCODE_RUN = 63;
+const int SHORTEST_LONG_RUN = 2 + LONG_ZEROCODE_RUN - SHORT_ZEROCODE_RUN;
+const int LONGEST_LONG_RUN = 255 + SHORTEST_LONG_RUN;
+
+static void hufPackEncTable(
+    const long long *hcode,  // i : encoding table [HUF_ENCSIZE]
+    int im,                  // i : min hcode index
+    int iM,                  // i : max hcode index
+    char **pcode)            //  o: ptr to packed table (updated)
+{
+  char *p = *pcode;
+  long long c = 0;
+  int lc = 0;
+
+  for (; im <= iM; im++) {
+    int l = hufLength(hcode[im]);
+
+    if (l == 0) {
+      int zerun = 1;
+
+      while ((im < iM) && (zerun < LONGEST_LONG_RUN)) {
+        if (hufLength(hcode[im + 1]) > 0) break;
+        im++;
+        zerun++;
+      }
+
+      if (zerun >= 2) {
+        if (zerun >= SHORTEST_LONG_RUN) {
+          outputBits(6, LONG_ZEROCODE_RUN, c, lc, p);
+          outputBits(8, zerun - SHORTEST_LONG_RUN, c, lc, p);
+        } else {
+          outputBits(6, SHORT_ZEROCODE_RUN + zerun - 2, c, lc, p);
+        }
+        continue;
+      }
+    }
+
+    outputBits(6, l, c, lc, p);
+  }
+
+  if (lc > 0) *p++ = (unsigned char)(c << (8 - lc));
+
+  *pcode = p;
+}
+
+//
+// Unpack an encoding table packed by hufPackEncTable():
+//
+
+static bool hufUnpackEncTable(
+    const char **pcode,  // io: ptr to packed table (updated)
+    int ni,              // i : input size (in bytes)
+    int im,              // i : min hcode index
+    int iM,              // i : max hcode index
+    long long *hcode)    //  o: encoding table [HUF_ENCSIZE]
+{
+  memset(hcode, 0, sizeof(long long) * HUF_ENCSIZE);
+
+  const char *p = *pcode;
+  long long c = 0;
+  int lc = 0;
+
+  for (; im <= iM; im++) {
+    if (p - *pcode >= ni) {
+      return false;
+    }
+
+    long long l = hcode[im] = getBits(6, c, lc, p);  // code length
+
+    if (l == (long long)LONG_ZEROCODE_RUN) {
+      if (p - *pcode > ni) {
+        return false;
+      }
+
+      int zerun = getBits(8, c, lc, p) + SHORTEST_LONG_RUN;
+
+      if (im + zerun > iM + 1) {
+        return false;
+      }
+
+      while (zerun--) hcode[im++] = 0;
+
+      im--;
+    } else if (l >= (long long)SHORT_ZEROCODE_RUN) {
+      int zerun = l - SHORT_ZEROCODE_RUN + 2;
+
+      if (im + zerun > iM + 1) {
+        return false;
+      }
+
+      while (zerun--) hcode[im++] = 0;
+
+      im--;
+    }
+  }
+
+  *pcode = const_cast<char *>(p);
+
+  hufCanonicalCodeTable(hcode);
+
+  return true;
+}
+
+//
+// DECODING TABLE BUILDING
+//
+
+//
+// Clear a newly allocated decoding table so that it contains only zeroes.
+//
+
+static void hufClearDecTable(HufDec *hdecod)  // io: (allocated by caller)
+//     decoding table [HUF_DECSIZE]
+{
+  for (int i = 0; i < HUF_DECSIZE; i++) {
+    hdecod[i].len = 0;
+    hdecod[i].lit = 0;
+    hdecod[i].p = NULL;
+  }
+  // memset(hdecod, 0, sizeof(HufDec) * HUF_DECSIZE);
+}
+
+//
+// Build a decoding hash table based on the encoding table hcode:
+//  - short codes (<= HUF_DECBITS) are resolved with a single table access;
+//  - long code entry allocations are not optimized, because long codes are
+//    unfrequent;
+//  - decoding tables are used by hufDecode();
+//
+
+static bool hufBuildDecTable(const long long *hcode,  // i : encoding table
+                             int im,                  // i : min index in hcode
+                             int iM,                  // i : max index in hcode
+                             HufDec *hdecod)  //  o: (allocated by caller)
+//     decoding table [HUF_DECSIZE]
+{
+  //
+  // Init hashtable & loop on all codes.
+  // Assumes that hufClearDecTable(hdecod) has already been called.
+  //
+
+  for (; im <= iM; im++) {
+    long long c = hufCode(hcode[im]);
+    int l = hufLength(hcode[im]);
+
+    if (c >> l) {
+      //
+      // Error: c is supposed to be an l-bit code,
+      // but c contains a value that is greater
+      // than the largest l-bit number.
+      //
+
+      // invalidTableEntry();
+      return false;
+    }
+
+    if (l > HUF_DECBITS) {
+      //
+      // Long code: add a secondary entry
+      //
+
+      HufDec *pl = hdecod + (c >> (l - HUF_DECBITS));
+
+      if (pl->len) {
+        //
+        // Error: a short code has already
+        // been stored in table entry *pl.
+        //
+
+        // invalidTableEntry();
+        return false;
+      }
+
+      pl->lit++;
+
+      if (pl->p) {
+        unsigned int *p = pl->p;
+        pl->p = new unsigned int[pl->lit];
+
+        for (unsigned int i = 0; i < pl->lit - 1u; ++i) pl->p[i] = p[i];
+
+        delete[] p;
+      } else {
+        pl->p = new unsigned int[1];
+      }
+
+      pl->p[pl->lit - 1] = im;
+    } else if (l) {
+      //
+      // Short code: init all primary entries
+      //
+
+      HufDec *pl = hdecod + (c << (HUF_DECBITS - l));
+
+      for (long long i = 1ULL << (HUF_DECBITS - l); i > 0; i--, pl++) {
+        if (pl->len || pl->p) {
+          //
+          // Error: a short code or a long code has
+          // already been stored in table entry *pl.
+          //
+
+          // invalidTableEntry();
+          return false;
+        }
+
+        pl->len = l;
+        pl->lit = im;
+      }
+    }
+  }
+
+  return true;
+}
+
+//
+// Free the long code entries of a decoding table built by hufBuildDecTable()
+//
+
+static void hufFreeDecTable(HufDec *hdecod)  // io: Decoding table
+{
+  for (int i = 0; i < HUF_DECSIZE; i++) {
+    if (hdecod[i].p) {
+      delete[] hdecod[i].p;
+      hdecod[i].p = 0;
+    }
+  }
+}
+
+//
+// ENCODING
+//
+
+inline void outputCode(long long code, long long &c, int &lc, char *&out) {
+  outputBits(hufLength(code), hufCode(code), c, lc, out);
+}
+
+inline void sendCode(long long sCode, int runCount, long long runCode,
+                     long long &c, int &lc, char *&out) {
+  //
+  // Output a run of runCount instances of the symbol sCount.
+  // Output the symbols explicitly, or if that is shorter, output
+  // the sCode symbol once followed by a runCode symbol and runCount
+  // expressed as an 8-bit number.
+  //
+
+  if (hufLength(sCode) + hufLength(runCode) + 8 < hufLength(sCode) * runCount) {
+    outputCode(sCode, c, lc, out);
+    outputCode(runCode, c, lc, out);
+    outputBits(8, runCount, c, lc, out);
+  } else {
+    while (runCount-- >= 0) outputCode(sCode, c, lc, out);
+  }
+}
+
+//
+// Encode (compress) ni values based on the Huffman encoding table hcode:
+//
+
+static int hufEncode            // return: output size (in bits)
+    (const long long *hcode,    // i : encoding table
+     const unsigned short *in,  // i : uncompressed input buffer
+     const int ni,              // i : input buffer size (in bytes)
+     int rlc,                   // i : rl code
+     char *out)                 //  o: compressed output buffer
+{
+  char *outStart = out;
+  long long c = 0;  // bits not yet written to out
+  int lc = 0;       // number of valid bits in c (LSB)
+  int s = in[0];
+  int cs = 0;
+
+  //
+  // Loop on input values
+  //
+
+  for (int i = 1; i < ni; i++) {
+    //
+    // Count same values or send code
+    //
+
+    if (s == in[i] && cs < 255) {
+      cs++;
+    } else {
+      sendCode(hcode[s], cs, hcode[rlc], c, lc, out);
+      cs = 0;
+    }
+
+    s = in[i];
+  }
+
+  //
+  // Send remaining code
+  //
+
+  sendCode(hcode[s], cs, hcode[rlc], c, lc, out);
+
+  if (lc) *out = (c << (8 - lc)) & 0xff;
+
+  return (out - outStart) * 8 + lc;
+}
+
+//
+// DECODING
+//
+
+//
+// In order to force the compiler to inline them,
+// getChar() and getCode() are implemented as macros
+// instead of "inline" functions.
+//
+
+#define getChar(c, lc, in)                   \
+  {                                          \
+    c = ((unsigned long long)c << 8) | *(unsigned char *)(in++); \
+    lc += 8;                                 \
+  }
+
+#if 0
+#define getCode(po, rlc, c, lc, in, out, ob, oe) \
+  {                                              \
+    if (po == rlc) {                             \
+      if (lc < 8) getChar(c, lc, in);            \
+                                                 \
+      lc -= 8;                                   \
+                                                 \
+      unsigned char cs = (c >> lc);              \
+                                                 \
+      if (out + cs > oe) return false;           \
+                                                 \
+      /* TinyEXR issue 78 */                     \
+      unsigned short s = out[-1];                \
+                                                 \
+      while (cs-- > 0) *out++ = s;               \
+    } else if (out < oe) {                       \
+      *out++ = po;                               \
+    } else {                                     \
+      return false;                              \
+    }                                            \
+  }
+#else
+static bool getCode(int po, int rlc, long long &c, int &lc, const char *&in,
+                    const char *in_end, unsigned short *&out,
+                    const unsigned short *ob, const unsigned short *oe) {
+  (void)ob;
+  if (po == rlc) {
+    if (lc < 8) {
+      /* TinyEXR issue 78 */
+      /* TinyEXR issue 160. in + 1 -> in */
+      if (in >= in_end) {
+        return false;
+      }
+
+      getChar(c, lc, in);
+    }
+
+    lc -= 8;
+
+    unsigned char cs = (c >> lc);
+
+    if (out + cs > oe) return false;
+
+    // Bounds check for safety
+    // Issue 100.
+    if ((out - 1) < ob) return false;
+    unsigned short s = out[-1];
+
+    while (cs-- > 0) *out++ = s;
+  } else if (out < oe) {
+    *out++ = po;
+  } else {
+    return false;
+  }
+  return true;
+}
+#endif
+
+//
+// Decode (uncompress) ni bits based on encoding & decoding tables:
+//
+
+static bool hufDecode(const long long *hcode,  // i : encoding table
+                      const HufDec *hdecod,    // i : decoding table
+                      const char *in,          // i : compressed input buffer
+                      int ni,                  // i : input size (in bits)
+                      int rlc,                 // i : run-length code
+                      int no,  // i : expected output size (in bytes)
+                      unsigned short *out)  //  o: uncompressed output buffer
+{
+  long long c = 0;
+  int lc = 0;
+  unsigned short *outb = out;          // begin
+  unsigned short *oe = out + no;       // end
+  const char *ie = in + (ni + 7) / 8;  // input byte size
+
+  //
+  // Loop on input bytes
+  //
+
+  while (in < ie) {
+    getChar(c, lc, in);
+
+    //
+    // Access decoding table
+    //
+
+    while (lc >= HUF_DECBITS) {
+      const HufDec pl = hdecod[(c >> (lc - HUF_DECBITS)) & HUF_DECMASK];
+
+      if (pl.len) {
+        //
+        // Get short code
+        //
+
+        lc -= pl.len;
+        // std::cout << "lit = " << pl.lit << std::endl;
+        // std::cout << "rlc = " << rlc << std::endl;
+        // std::cout << "c = " << c << std::endl;
+        // std::cout << "lc = " << lc << std::endl;
+        // std::cout << "in = " << in << std::endl;
+        // std::cout << "out = " << out << std::endl;
+        // std::cout << "oe = " << oe << std::endl;
+        if (!getCode(pl.lit, rlc, c, lc, in, ie, out, outb, oe)) {
+          return false;
+        }
+      } else {
+        if (!pl.p) {
+          return false;
+        }
+        // invalidCode(); // wrong code
+
+        //
+        // Search long code
+        //
+
+        unsigned int j;
+
+        for (j = 0; j < pl.lit; j++) {
+          int l = hufLength(hcode[pl.p[j]]);
+
+          while (lc < l && in < ie)  // get more bits
+            getChar(c, lc, in);
+
+          if (lc >= l) {
+            if (hufCode(hcode[pl.p[j]]) ==
+                ((c >> (lc - l)) & (((long long)(1) << l) - 1))) {
+              //
+              // Found : get long code
+              //
+
+              lc -= l;
+              if (!getCode(pl.p[j], rlc, c, lc, in, ie, out, outb, oe)) {
+                return false;
+              }
+              break;
+            }
+          }
+        }
+
+        if (j == pl.lit) {
+          return false;
+          // invalidCode(); // Not found
+        }
+      }
+    }
+  }
+
+  //
+  // Get remaining (short) codes
+  //
+
+  int i = (8 - ni) & 7;
+  c >>= i;
+  lc -= i;
+
+  while (lc > 0) {
+    const HufDec pl = hdecod[((unsigned long long)c << (HUF_DECBITS - lc)) & HUF_DECMASK];
+
+    if (pl.len) {
+      lc -= pl.len;
+      if (!getCode(pl.lit, rlc, c, lc, in, ie, out, outb, oe)) {
+        return false;
+      }
+    } else {
+      return false;
+      // invalidCode(); // wrong (long) code
+    }
+  }
+
+  if (out - outb != no) {
+    return false;
+  }
+  // notEnoughData ();
+
+  return true;
+}
+
+static void countFrequencies(std::vector<long long> &freq,
+                             const unsigned short data[/*n*/], int n) {
+  for (int i = 0; i < HUF_ENCSIZE; ++i) freq[i] = 0;
+
+  for (int i = 0; i < n; ++i) ++freq[data[i]];
+}
+
+static void writeUInt(char buf[4], unsigned int i) {
+  unsigned char *b = (unsigned char *)buf;
+
+  b[0] = i;
+  b[1] = i >> 8;
+  b[2] = i >> 16;
+  b[3] = i >> 24;
+}
+
+static unsigned int readUInt(const char buf[4]) {
+  const unsigned char *b = (const unsigned char *)buf;
+
+  return (b[0] & 0x000000ff) | ((b[1] << 8) & 0x0000ff00) |
+         ((b[2] << 16) & 0x00ff0000) | ((b[3] << 24) & 0xff000000);
+}
+
+//
+// EXTERNAL INTERFACE
+//
+
+static int hufCompress(const unsigned short raw[], int nRaw,
+                       char compressed[]) {
+  if (nRaw == 0) return 0;
+
+  std::vector<long long> freq(HUF_ENCSIZE);
+
+  countFrequencies(freq, raw, nRaw);
+
+  int im = 0;
+  int iM = 0;
+  hufBuildEncTable(freq.data(), &im, &iM);
+
+  char *tableStart = compressed + 20;
+  char *tableEnd = tableStart;
+  hufPackEncTable(freq.data(), im, iM, &tableEnd);
+  int tableLength = tableEnd - tableStart;
+
+  char *dataStart = tableEnd;
+  int nBits = hufEncode(freq.data(), raw, nRaw, iM, dataStart);
+  int data_length = (nBits + 7) / 8;
+
+  writeUInt(compressed, im);
+  writeUInt(compressed + 4, iM);
+  writeUInt(compressed + 8, tableLength);
+  writeUInt(compressed + 12, nBits);
+  writeUInt(compressed + 16, 0);  // room for future extensions
+
+  return dataStart + data_length - compressed;
+}
+
+static bool hufUncompress(const char compressed[], int nCompressed,
+                          std::vector<unsigned short> *raw) {
+  if (nCompressed == 0) {
+    if (raw->size() != 0) return false;
+
+    return false;
+  }
+
+  int im = readUInt(compressed);
+  int iM = readUInt(compressed + 4);
+  // int tableLength = readUInt (compressed + 8);
+  int nBits = readUInt(compressed + 12);
+
+  if (im < 0 || im >= HUF_ENCSIZE || iM < 0 || iM >= HUF_ENCSIZE) return false;
+
+  const char *ptr = compressed + 20;
+
+  //
+  // Fast decoder needs at least 2x64-bits of compressed data, and
+  // needs to be run-able on this platform. Otherwise, fall back
+  // to the original decoder
+  //
+
+  // if (FastHufDecoder::enabled() && nBits > 128)
+  //{
+  //    FastHufDecoder fhd (ptr, nCompressed - (ptr - compressed), im, iM, iM);
+  //    fhd.decode ((unsigned char*)ptr, nBits, raw, nRaw);
+  //}
+  // else
+  {
+    std::vector<long long> freq(HUF_ENCSIZE);
+    std::vector<HufDec> hdec(HUF_DECSIZE);
+
+    hufClearDecTable(&hdec.at(0));
+
+    hufUnpackEncTable(&ptr, nCompressed - (ptr - compressed), im, iM,
+                      &freq.at(0));
+
+    {
+      if (nBits > 8 * (nCompressed - (ptr - compressed))) {
+        return false;
+      }
+
+      hufBuildDecTable(&freq.at(0), im, iM, &hdec.at(0));
+      hufDecode(&freq.at(0), &hdec.at(0), ptr, nBits, iM, raw->size(),
+                raw->data());
+    }
+    // catch (...)
+    //{
+    //    hufFreeDecTable (hdec);
+    //    throw;
+    //}
+
+    hufFreeDecTable(&hdec.at(0));
+  }
+
+  return true;
+}
+
+//
+// Functions to compress the range of values in the pixel data
+//
+
+const int USHORT_RANGE = (1 << 16);
+const int BITMAP_SIZE = (USHORT_RANGE >> 3);
+
+static void bitmapFromData(const unsigned short data[/*nData*/], int nData,
+                           unsigned char bitmap[BITMAP_SIZE],
+                           unsigned short &minNonZero,
+                           unsigned short &maxNonZero) {
+  for (int i = 0; i < BITMAP_SIZE; ++i) bitmap[i] = 0;
+
+  for (int i = 0; i < nData; ++i) bitmap[data[i] >> 3] |= (1 << (data[i] & 7));
+
+  bitmap[0] &= ~1;  // zero is not explicitly stored in
+                    // the bitmap; we assume that the
+                    // data always contain zeroes
+  minNonZero = BITMAP_SIZE - 1;
+  maxNonZero = 0;
+
+  for (int i = 0; i < BITMAP_SIZE; ++i) {
+    if (bitmap[i]) {
+      if (minNonZero > i) minNonZero = i;
+      if (maxNonZero < i) maxNonZero = i;
+    }
+  }
+}
+
+static unsigned short forwardLutFromBitmap(
+    const unsigned char bitmap[BITMAP_SIZE], unsigned short lut[USHORT_RANGE]) {
+  int k = 0;
+
+  for (int i = 0; i < USHORT_RANGE; ++i) {
+    if ((i == 0) || (bitmap[i >> 3] & (1 << (i & 7))))
+      lut[i] = k++;
+    else
+      lut[i] = 0;
+  }
+
+  return k - 1;  // maximum value stored in lut[],
+}  // i.e. number of ones in bitmap minus 1
+
+static unsigned short reverseLutFromBitmap(
+    const unsigned char bitmap[BITMAP_SIZE], unsigned short lut[USHORT_RANGE]) {
+  int k = 0;
+
+  for (int i = 0; i < USHORT_RANGE; ++i) {
+    if ((i == 0) || (bitmap[i >> 3] & (1 << (i & 7)))) lut[k++] = i;
+  }
+
+  int n = k - 1;
+
+  while (k < USHORT_RANGE) lut[k++] = 0;
+
+  return n;  // maximum k where lut[k] is non-zero,
+}  // i.e. number of ones in bitmap minus 1
+
+static void applyLut(const unsigned short lut[USHORT_RANGE],
+                     unsigned short data[/*nData*/], int nData) {
+  for (int i = 0; i < nData; ++i) data[i] = lut[data[i]];
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif  // __clang__
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+static bool CompressPiz(unsigned char *outPtr, unsigned int *outSize,
+                        const unsigned char *inPtr, size_t inSize,
+                        const std::vector<ChannelInfo> &channelInfo,
+                        int data_width, int num_lines) {
+  std::vector<unsigned char> bitmap(BITMAP_SIZE);
+  unsigned short minNonZero;
+  unsigned short maxNonZero;
+
+#if !TINYEXR_LITTLE_ENDIAN
+  // @todo { PIZ compression on BigEndian architecture. }
+  return false;
+#endif
+
+  // Assume `inSize` is multiple of 2 or 4.
+  std::vector<unsigned short> tmpBuffer(inSize / sizeof(unsigned short));
+
+  std::vector<PIZChannelData> channelData(channelInfo.size());
+  unsigned short *tmpBufferEnd = &tmpBuffer.at(0);
+
+  for (size_t c = 0; c < channelData.size(); c++) {
+    PIZChannelData &cd = channelData[c];
+
+    cd.start = tmpBufferEnd;
+    cd.end = cd.start;
+
+    cd.nx = data_width;
+    cd.ny = num_lines;
+    // cd.ys = c.channel().ySampling;
+
+    size_t pixelSize = sizeof(int);  // UINT and FLOAT
+    if (channelInfo[c].requested_pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      pixelSize = sizeof(short);
+    }
+
+    cd.size = static_cast<int>(pixelSize / sizeof(short));
+
+    tmpBufferEnd += cd.nx * cd.ny * cd.size;
+  }
+
+  const unsigned char *ptr = inPtr;
+  for (int y = 0; y < num_lines; ++y) {
+    for (size_t i = 0; i < channelData.size(); ++i) {
+      PIZChannelData &cd = channelData[i];
+
+      // if (modp (y, cd.ys) != 0)
+      //    continue;
+
+      size_t n = static_cast<size_t>(cd.nx * cd.size);
+      memcpy(cd.end, ptr, n * sizeof(unsigned short));
+      ptr += n * sizeof(unsigned short);
+      cd.end += n;
+    }
+  }
+
+  bitmapFromData(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()),
+                 bitmap.data(), minNonZero, maxNonZero);
+
+  std::vector<unsigned short> lut(USHORT_RANGE);
+  unsigned short maxValue = forwardLutFromBitmap(bitmap.data(), lut.data());
+  applyLut(lut.data(), &tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()));
+
+  //
+  // Store range compression info in _outBuffer
+  //
+
+  char *buf = reinterpret_cast<char *>(outPtr);
+
+  memcpy(buf, &minNonZero, sizeof(unsigned short));
+  buf += sizeof(unsigned short);
+  memcpy(buf, &maxNonZero, sizeof(unsigned short));
+  buf += sizeof(unsigned short);
+
+  if (minNonZero <= maxNonZero) {
+    memcpy(buf, reinterpret_cast<char *>(&bitmap[0] + minNonZero),
+           maxNonZero - minNonZero + 1);
+    buf += maxNonZero - minNonZero + 1;
+  }
+
+  //
+  // Apply wavelet encoding
+  //
+
+  for (size_t i = 0; i < channelData.size(); ++i) {
+    PIZChannelData &cd = channelData[i];
+
+    for (int j = 0; j < cd.size; ++j) {
+      wav2Encode(cd.start + j, cd.nx, cd.size, cd.ny, cd.nx * cd.size,
+                 maxValue);
+    }
+  }
+
+  //
+  // Apply Huffman encoding; append the result to _outBuffer
+  //
+
+  // length header(4byte), then huff data. Initialize length header with zero,
+  // then later fill it by `length`.
+  char *lengthPtr = buf;
+  int zero = 0;
+  memcpy(buf, &zero, sizeof(int));
+  buf += sizeof(int);
+
+  int length =
+      hufCompress(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()), buf);
+  memcpy(lengthPtr, &length, sizeof(int));
+
+  (*outSize) = static_cast<unsigned int>(
+      (reinterpret_cast<unsigned char *>(buf) - outPtr) +
+      static_cast<unsigned int>(length));
+
+  // Use uncompressed data when compressed data is larger than uncompressed.
+  // (Issue 40)
+  if ((*outSize) >= inSize) {
+    (*outSize) = static_cast<unsigned int>(inSize);
+    memcpy(outPtr, inPtr, inSize);
+  }
+  return true;
+}
+
+static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
+                          size_t tmpBufSizeInBytes, size_t inLen, int num_channels,
+                          const EXRChannelInfo *channels, int data_width,
+                          int num_lines) {
+  if (inLen == tmpBufSizeInBytes) {
+    // Data is not compressed(Issue 40).
+    memcpy(outPtr, inPtr, inLen);
+    return true;
+  }
+
+  std::vector<unsigned char> bitmap(BITMAP_SIZE);
+  unsigned short minNonZero;
+  unsigned short maxNonZero;
+
+#if !TINYEXR_LITTLE_ENDIAN
+  // @todo { PIZ compression on BigEndian architecture. }
+  return false;
+#endif
+
+  memset(bitmap.data(), 0, BITMAP_SIZE);
+
+  if (inLen < 4) {
+    return false;
+  }
+
+  size_t readLen = 0;
+
+  const unsigned char *ptr = inPtr;
+  // minNonZero = *(reinterpret_cast<const unsigned short *>(ptr));
+  tinyexr::cpy2(&minNonZero, reinterpret_cast<const unsigned short *>(ptr));
+  // maxNonZero = *(reinterpret_cast<const unsigned short *>(ptr + 2));
+  tinyexr::cpy2(&maxNonZero, reinterpret_cast<const unsigned short *>(ptr + 2));
+  ptr += 4;
+  readLen += 4;
+
+  if (maxNonZero >= BITMAP_SIZE) {
+    return false;
+  }
+
+  //printf("maxNonZero = %d\n", maxNonZero);
+  //printf("minNonZero = %d\n", minNonZero);
+  //printf("len = %d\n", (maxNonZero - minNonZero + 1));
+  //printf("BITMAPSIZE - min = %d\n", (BITMAP_SIZE - minNonZero));
+
+  if (minNonZero <= maxNonZero) {
+    if (((maxNonZero - minNonZero + 1) + readLen) > inLen) {
+      // Input too short
+      return false;
+    }
+
+    memcpy(reinterpret_cast<char *>(&bitmap[0] + minNonZero), ptr,
+           maxNonZero - minNonZero + 1);
+    ptr += maxNonZero - minNonZero + 1;
+    readLen += maxNonZero - minNonZero + 1;
+  } else {
+    // Issue 194
+    if ((minNonZero == (BITMAP_SIZE - 1)) && (maxNonZero == 0)) {
+      // OK. all pixels are zero. And no need to read `bitmap` data.
+    } else {
+      // invalid minNonZero/maxNonZero combination.
+      return false;
+    }
+  }
+
+  std::vector<unsigned short> lut(USHORT_RANGE);
+  memset(lut.data(), 0, sizeof(unsigned short) * USHORT_RANGE);
+  unsigned short maxValue = reverseLutFromBitmap(bitmap.data(), lut.data());
+
+  //
+  // Huffman decoding
+  //
+
+  if ((readLen + 4) > inLen) {
+    return false;
+  }
+
+  int length=0;
+
+  // length = *(reinterpret_cast<const int *>(ptr));
+  tinyexr::cpy4(&length, reinterpret_cast<const int *>(ptr));
+  ptr += sizeof(int);
+
+  if (size_t((ptr - inPtr) + length) > inLen) {
+    return false;
+  }
+
+  std::vector<unsigned short> tmpBuffer(tmpBufSizeInBytes / sizeof(unsigned short));
+  hufUncompress(reinterpret_cast<const char *>(ptr), length, &tmpBuffer);
+
+  //
+  // Wavelet decoding
+  //
+
+  std::vector<PIZChannelData> channelData(static_cast<size_t>(num_channels));
+
+  unsigned short *tmpBufferEnd = &tmpBuffer.at(0);
+
+  for (size_t i = 0; i < static_cast<size_t>(num_channels); ++i) {
+    const EXRChannelInfo &chan = channels[i];
+
+    size_t pixelSize = sizeof(int);  // UINT and FLOAT
+    if (chan.pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      pixelSize = sizeof(short);
+    }
+
+    channelData[i].start = tmpBufferEnd;
+    channelData[i].end = channelData[i].start;
+    channelData[i].nx = data_width;
+    channelData[i].ny = num_lines;
+    // channelData[i].ys = 1;
+    channelData[i].size = static_cast<int>(pixelSize / sizeof(short));
+
+    tmpBufferEnd += channelData[i].nx * channelData[i].ny * channelData[i].size;
+  }
+
+  for (size_t i = 0; i < channelData.size(); ++i) {
+    PIZChannelData &cd = channelData[i];
+
+    for (int j = 0; j < cd.size; ++j) {
+      wav2Decode(cd.start + j, cd.nx, cd.size, cd.ny, cd.nx * cd.size,
+                 maxValue);
+    }
+  }
+
+  //
+  // Expand the pixel data to their original range
+  //
+
+  applyLut(lut.data(), &tmpBuffer.at(0), static_cast<int>(tmpBufSizeInBytes / sizeof(unsigned short)));
+
+  for (int y = 0; y < num_lines; y++) {
+    for (size_t i = 0; i < channelData.size(); ++i) {
+      PIZChannelData &cd = channelData[i];
+
+      // if (modp (y, cd.ys) != 0)
+      //    continue;
+
+      size_t n = static_cast<size_t>(cd.nx * cd.size);
+      memcpy(outPtr, cd.end, static_cast<size_t>(n * sizeof(unsigned short)));
+      outPtr += n * sizeof(unsigned short);
+      cd.end += n;
+    }
+  }
+
+  return true;
+}
+#endif  // TINYEXR_USE_PIZ
+
+#if TINYEXR_USE_ZFP
+
+struct ZFPCompressionParam {
+  double rate;
+  unsigned int precision;
+  unsigned int __pad0;
+  double tolerance;
+  int type;  // TINYEXR_ZFP_COMPRESSIONTYPE_*
+  unsigned int __pad1;
+
+  ZFPCompressionParam() {
+    type = TINYEXR_ZFP_COMPRESSIONTYPE_RATE;
+    rate = 2.0;
+    precision = 0;
+    tolerance = 0.0;
+  }
+};
+
+static bool FindZFPCompressionParam(ZFPCompressionParam *param,
+                                    const EXRAttribute *attributes,
+                                    int num_attributes, std::string *err) {
+  bool foundType = false;
+
+  for (int i = 0; i < num_attributes; i++) {
+    if ((strcmp(attributes[i].name, "zfpCompressionType") == 0)) {
+      if (attributes[i].size == 1) {
+        param->type = static_cast<int>(attributes[i].value[0]);
+        foundType = true;
+        break;
+      } else {
+        if (err) {
+          (*err) +=
+              "zfpCompressionType attribute must be uchar(1 byte) type.\n";
+        }
+        return false;
+      }
+    }
+  }
+
+  if (!foundType) {
+    if (err) {
+      (*err) += "`zfpCompressionType` attribute not found.\n";
+    }
+    return false;
+  }
+
+  if (param->type == TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {
+    for (int i = 0; i < num_attributes; i++) {
+      if ((strcmp(attributes[i].name, "zfpCompressionRate") == 0) &&
+          (attributes[i].size == 8)) {
+        param->rate = *(reinterpret_cast<double *>(attributes[i].value));
+        return true;
+      }
+    }
+
+    if (err) {
+      (*err) += "`zfpCompressionRate` attribute not found.\n";
+    }
+
+  } else if (param->type == TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION) {
+    for (int i = 0; i < num_attributes; i++) {
+      if ((strcmp(attributes[i].name, "zfpCompressionPrecision") == 0) &&
+          (attributes[i].size == 4)) {
+        param->rate = *(reinterpret_cast<int *>(attributes[i].value));
+        return true;
+      }
+    }
+
+    if (err) {
+      (*err) += "`zfpCompressionPrecision` attribute not found.\n";
+    }
+
+  } else if (param->type == TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY) {
+    for (int i = 0; i < num_attributes; i++) {
+      if ((strcmp(attributes[i].name, "zfpCompressionTolerance") == 0) &&
+          (attributes[i].size == 8)) {
+        param->tolerance = *(reinterpret_cast<double *>(attributes[i].value));
+        return true;
+      }
+    }
+
+    if (err) {
+      (*err) += "`zfpCompressionTolerance` attribute not found.\n";
+    }
+  } else {
+    if (err) {
+      (*err) += "Unknown value specified for `zfpCompressionType`.\n";
+    }
+  }
+
+  return false;
+}
+
+// Assume pixel format is FLOAT for all channels.
+static bool DecompressZfp(float *dst, int dst_width, int dst_num_lines,
+                          size_t num_channels, const unsigned char *src,
+                          unsigned long src_size,
+                          const ZFPCompressionParam &param) {
+  size_t uncompressed_size =
+      size_t(dst_width) * size_t(dst_num_lines) * num_channels;
+
+  if (uncompressed_size == src_size) {
+    // Data is not compressed(Issue 40).
+    memcpy(dst, src, src_size);
+  }
+
+  zfp_stream *zfp = NULL;
+  zfp_field *field = NULL;
+
+  TINYEXR_CHECK_AND_RETURN_C((dst_width % 4) == 0, false);
+  TINYEXR_CHECK_AND_RETURN_C((dst_num_lines % 4) == 0, false);
+
+  if ((size_t(dst_width) & 3U) || (size_t(dst_num_lines) & 3U)) {
+    return false;
+  }
+
+  field =
+      zfp_field_2d(reinterpret_cast<void *>(const_cast<unsigned char *>(src)),
+                   zfp_type_float, static_cast<unsigned int>(dst_width),
+                   static_cast<unsigned int>(dst_num_lines) *
+                       static_cast<unsigned int>(num_channels));
+  zfp = zfp_stream_open(NULL);
+
+  if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {
+    zfp_stream_set_rate(zfp, param.rate, zfp_type_float, /* dimension */ 2,
+                        /* write random access */ 0);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION) {
+    zfp_stream_set_precision(zfp, param.precision);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY) {
+    zfp_stream_set_accuracy(zfp, param.tolerance);
+  } else {
+    return false;
+  }
+
+  size_t buf_size = zfp_stream_maximum_size(zfp, field);
+  std::vector<unsigned char> buf(buf_size);
+  memcpy(&buf.at(0), src, src_size);
+
+  bitstream *stream = stream_open(&buf.at(0), buf_size);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_stream_rewind(zfp);
+
+  size_t image_size = size_t(dst_width) * size_t(dst_num_lines);
+
+  for (size_t c = 0; c < size_t(num_channels); c++) {
+    // decompress 4x4 pixel block.
+    for (size_t y = 0; y < size_t(dst_num_lines); y += 4) {
+      for (size_t x = 0; x < size_t(dst_width); x += 4) {
+        float fblock[16];
+        zfp_decode_block_float_2(zfp, fblock);
+        for (size_t j = 0; j < 4; j++) {
+          for (size_t i = 0; i < 4; i++) {
+            dst[c * image_size + ((y + j) * size_t(dst_width) + (x + i))] =
+                fblock[j * 4 + i];
+          }
+        }
+      }
+    }
+  }
+
+  zfp_field_free(field);
+  zfp_stream_close(zfp);
+  stream_close(stream);
+
+  return true;
+}
+
+// Assume pixel format is FLOAT for all channels.
+static bool CompressZfp(std::vector<unsigned char> *outBuf,
+                        unsigned int *outSize, const float *inPtr, int width,
+                        int num_lines, int num_channels,
+                        const ZFPCompressionParam &param) {
+  zfp_stream *zfp = NULL;
+  zfp_field *field = NULL;
+
+  TINYEXR_CHECK_AND_RETURN_C((width % 4) == 0, false);
+  TINYEXR_CHECK_AND_RETURN_C((num_lines % 4) == 0, false);
+
+  if ((size_t(width) & 3U) || (size_t(num_lines) & 3U)) {
+    return false;
+  }
+
+  // create input array.
+  field = zfp_field_2d(reinterpret_cast<void *>(const_cast<float *>(inPtr)),
+                       zfp_type_float, static_cast<unsigned int>(width),
+                       static_cast<unsigned int>(num_lines * num_channels));
+
+  zfp = zfp_stream_open(NULL);
+
+  if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {
+    zfp_stream_set_rate(zfp, param.rate, zfp_type_float, 2, 0);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION) {
+    zfp_stream_set_precision(zfp, param.precision);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY) {
+    zfp_stream_set_accuracy(zfp, param.tolerance);
+  } else {
+    return false;
+  }
+
+  size_t buf_size = zfp_stream_maximum_size(zfp, field);
+
+  outBuf->resize(buf_size);
+
+  bitstream *stream = stream_open(&outBuf->at(0), buf_size);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_field_free(field);
+
+  size_t image_size = size_t(width) * size_t(num_lines);
+
+  for (size_t c = 0; c < size_t(num_channels); c++) {
+    // compress 4x4 pixel block.
+    for (size_t y = 0; y < size_t(num_lines); y += 4) {
+      for (size_t x = 0; x < size_t(width); x += 4) {
+        float fblock[16];
+        for (size_t j = 0; j < 4; j++) {
+          for (size_t i = 0; i < 4; i++) {
+            fblock[j * 4 + i] =
+                inPtr[c * image_size + ((y + j) * size_t(width) + (x + i))];
+          }
+        }
+        zfp_encode_block_float_2(zfp, fblock);
+      }
+    }
+  }
+
+  zfp_stream_flush(zfp);
+  (*outSize) = static_cast<unsigned int>(zfp_stream_compressed_size(zfp));
+
+  zfp_stream_close(zfp);
+
+  return true;
+}
+
+#endif
+
+//
+// -----------------------------------------------------------------
+//
+
+// heuristics
+#define TINYEXR_DIMENSION_THRESHOLD (1024 * 8192)
+
+// TODO(syoyo): Refactor function arguments.
+static bool DecodePixelData(/* out */ unsigned char **out_images,
+                            const int *requested_pixel_types,
+                            const unsigned char *data_ptr, size_t data_len,
+                            int compression_type, int line_order, int width,
+                            int height, int x_stride, int y, int line_no,
+                            int num_lines, size_t pixel_data_size,
+                            size_t num_attributes,
+                            const EXRAttribute *attributes, size_t num_channels,
+                            const EXRChannelInfo *channels,
+                            const std::vector<size_t> &channel_offset_list) {
+  if (compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {  // PIZ
+#if TINYEXR_USE_PIZ
+    if ((width == 0) || (num_lines == 0) || (pixel_data_size == 0)) {
+      // Invalid input #90
+      return false;
+    }
+
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(
+        static_cast<size_t>(width * num_lines) * pixel_data_size));
+    size_t tmpBufLen = outBuf.size();
+
+    bool ret = tinyexr::DecompressPiz(
+        reinterpret_cast<unsigned char *>(&outBuf.at(0)), data_ptr, tmpBufLen,
+        data_len, static_cast<int>(num_channels), channels, width, num_lines);
+
+    if (!ret) {
+      return false;
+    }
+
+    // For PIZ_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            FP16 hf;
+
+            // hf.u = line_ptr[u];
+            // use `cpy` to avoid unaligned memory access when compiler's
+            // optimization is on.
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+              unsigned short *image =
+                  reinterpret_cast<unsigned short **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += static_cast<size_t>(
+                             (height - 1 - (line_no + static_cast<int>(v)))) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = hf.u;
+            } else {  // HALF -> FLOAT
+              FP32 f32 = half_to_float(hf);
+              float *image = reinterpret_cast<float **>(out_images)[c];
+              size_t offset = 0;
+              if (line_order == 0) {
+                offset = (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                offset = static_cast<size_t>(
+                             (height - 1 - (line_no + static_cast<int>(v)))) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              image += offset;
+              *image = f32.f;
+            }
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT, false);
+
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned int *line_ptr = reinterpret_cast<unsigned int *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            unsigned int val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(&val);
+
+            unsigned int *image =
+                reinterpret_cast<unsigned int **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += static_cast<size_t>(
+                           (height - 1 - (line_no + static_cast<int>(v)))) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT, false);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(&outBuf.at(
+              v * pixel_data_size * static_cast<size_t>(width) +
+              channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += static_cast<size_t>(
+                           (height - 1 - (line_no + static_cast<int>(v)))) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        return false;
+      }
+    }
+#else
+    return false;
+#endif
+
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS ||
+             compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(width) *
+                                      static_cast<size_t>(num_lines) *
+                                      pixel_data_size);
+
+    unsigned long dstLen = static_cast<unsigned long>(outBuf.size());
+    TINYEXR_CHECK_AND_RETURN_C(dstLen > 0, false);
+    if (!tinyexr::DecompressZip(
+            reinterpret_cast<unsigned char *>(&outBuf.at(0)), &dstLen, data_ptr,
+            static_cast<unsigned long>(data_len))) {
+      return false;
+    }
+
+    // For ZIP_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+              &outBuf.at(v * static_cast<size_t>(pixel_data_size) *
+                             static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            tinyexr::FP16 hf;
+
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+              unsigned short *image =
+                  reinterpret_cast<unsigned short **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = hf.u;
+            } else {  // HALF -> FLOAT
+              tinyexr::FP32 f32 = half_to_float(hf);
+              float *image = reinterpret_cast<float **>(out_images)[c];
+              size_t offset = 0;
+              if (line_order == 0) {
+                offset = (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                offset = (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              image += offset;
+
+              *image = f32.f;
+            }
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT, false);
+
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned int *line_ptr = reinterpret_cast<unsigned int *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            unsigned int val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(&val);
+
+            unsigned int *image =
+                reinterpret_cast<unsigned int **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT, false);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        return false;
+      }
+    }
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) {
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(width) *
+                                      static_cast<size_t>(num_lines) *
+                                      pixel_data_size);
+
+    unsigned long dstLen = static_cast<unsigned long>(outBuf.size());
+    if (dstLen == 0) {
+      return false;
+    }
+
+    if (!tinyexr::DecompressRle(
+            reinterpret_cast<unsigned char *>(&outBuf.at(0)), dstLen, data_ptr,
+            static_cast<unsigned long>(data_len))) {
+      return false;
+    }
+
+    // For RLE_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+              &outBuf.at(v * static_cast<size_t>(pixel_data_size) *
+                             static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            tinyexr::FP16 hf;
+
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+              unsigned short *image =
+                  reinterpret_cast<unsigned short **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = hf.u;
+            } else {  // HALF -> FLOAT
+              tinyexr::FP32 f32 = half_to_float(hf);
+              float *image = reinterpret_cast<float **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = f32.f;
+            }
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT, false);
+
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned int *line_ptr = reinterpret_cast<unsigned int *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            unsigned int val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(&val);
+
+            unsigned int *image =
+                reinterpret_cast<unsigned int **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT, false);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        return false;
+      }
+    }
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if TINYEXR_USE_ZFP
+    tinyexr::ZFPCompressionParam zfp_compression_param;
+    std::string e;
+    if (!tinyexr::FindZFPCompressionParam(&zfp_compression_param, attributes,
+                                          int(num_attributes), &e)) {
+      // This code path should not be reachable.
+      return false;
+    }
+
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(width) *
+                                      static_cast<size_t>(num_lines) *
+                                      pixel_data_size);
+
+    unsigned long dstLen = outBuf.size();
+    TINYEXR_CHECK_AND_RETURN_C(dstLen > 0, false);
+    tinyexr::DecompressZfp(reinterpret_cast<float *>(&outBuf.at(0)), width,
+                           num_lines, num_channels, data_ptr,
+                           static_cast<unsigned long>(data_len),
+                           zfp_compression_param);
+
+    // For ZFP_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      TINYEXR_CHECK_AND_RETURN_C(channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT, false);
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        TINYEXR_CHECK_AND_RETURN_C(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT, false);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val;
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        return false;
+      }
+    }
+#else
+    (void)attributes;
+    (void)num_attributes;
+    (void)num_channels;
+    return false;
+#endif
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_NONE) {
+    for (size_t c = 0; c < num_channels; c++) {
+      for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+        if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+          const unsigned short *line_ptr =
+              reinterpret_cast<const unsigned short *>(
+                  data_ptr + v * pixel_data_size * size_t(width) +
+                  channel_offset_list[c] * static_cast<size_t>(width));
+
+          if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+            unsigned short *outLine =
+                reinterpret_cast<unsigned short *>(out_images[c]);
+            if (line_order == 0) {
+              outLine += (size_t(y) + v) * size_t(x_stride);
+            } else {
+              outLine +=
+                  (size_t(height) - 1 - (size_t(y) + v)) * size_t(x_stride);
+            }
+
+            for (int u = 0; u < width; u++) {
+              tinyexr::FP16 hf;
+
+              // hf.u = line_ptr[u];
+              tinyexr::cpy2(&(hf.u), line_ptr + u);
+
+              tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+              outLine[u] = hf.u;
+            }
+          } else if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT) {
+            float *outLine = reinterpret_cast<float *>(out_images[c]);
+            if (line_order == 0) {
+              outLine += (size_t(y) + v) * size_t(x_stride);
+            } else {
+              outLine +=
+                  (size_t(height) - 1 - (size_t(y) + v)) * size_t(x_stride);
+            }
+
+            if (reinterpret_cast<const unsigned char *>(line_ptr + width) >
+                (data_ptr + data_len)) {
+              // Insufficient data size
+              return false;
+            }
+
+            for (int u = 0; u < width; u++) {
+              tinyexr::FP16 hf;
+
+              // address may not be aligned. use byte-wise copy for safety.#76
+              // hf.u = line_ptr[u];
+              tinyexr::cpy2(&(hf.u), line_ptr + u);
+
+              tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+              tinyexr::FP32 f32 = half_to_float(hf);
+
+              outLine[u] = f32.f;
+            }
+          } else {
+            return false;
+          }
+        } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+          const float *line_ptr = reinterpret_cast<const float *>(
+              data_ptr + v * pixel_data_size * size_t(width) +
+              channel_offset_list[c] * static_cast<size_t>(width));
+
+          float *outLine = reinterpret_cast<float *>(out_images[c]);
+          if (line_order == 0) {
+            outLine += (size_t(y) + v) * size_t(x_stride);
+          } else {
+            outLine +=
+                (size_t(height) - 1 - (size_t(y) + v)) * size_t(x_stride);
+          }
+
+          if (reinterpret_cast<const unsigned char *>(line_ptr + width) >
+              (data_ptr + data_len)) {
+            // Insufficient data size
+            return false;
+          }
+
+          for (int u = 0; u < width; u++) {
+            float val;
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            outLine[u] = val;
+          }
+        } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+          const unsigned int *line_ptr = reinterpret_cast<const unsigned int *>(
+              data_ptr + v * pixel_data_size * size_t(width) +
+              channel_offset_list[c] * static_cast<size_t>(width));
+
+          unsigned int *outLine =
+              reinterpret_cast<unsigned int *>(out_images[c]);
+          if (line_order == 0) {
+            outLine += (size_t(y) + v) * size_t(x_stride);
+          } else {
+            outLine +=
+                (size_t(height) - 1 - (size_t(y) + v)) * size_t(x_stride);
+          }
+
+          if (reinterpret_cast<const unsigned char *>(line_ptr + width) >
+              (data_ptr + data_len)) {
+            // Corrupted data
+            return false;
+          }
+
+          for (int u = 0; u < width; u++) {
+
+            unsigned int val;
+            tinyexr::cpy4(&val, line_ptr + u);
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            outLine[u] = val;
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool DecodeTiledPixelData(
+    unsigned char **out_images, int *width, int *height,
+    const int *requested_pixel_types, const unsigned char *data_ptr,
+    size_t data_len, int compression_type, int line_order, int data_width,
+    int data_height, int tile_offset_x, int tile_offset_y, int tile_size_x,
+    int tile_size_y, size_t pixel_data_size, size_t num_attributes,
+    const EXRAttribute *attributes, size_t num_channels,
+    const EXRChannelInfo *channels,
+    const std::vector<size_t> &channel_offset_list) {
+  // Here, data_width and data_height are the dimensions of the current (sub)level.
+  if (tile_size_x * tile_offset_x > data_width ||
+      tile_size_y * tile_offset_y > data_height) {
+    return false;
+  }
+
+  // Compute actual image size in a tile.
+  if ((tile_offset_x + 1) * tile_size_x >= data_width) {
+    (*width) = data_width - (tile_offset_x * tile_size_x);
+  } else {
+    (*width) = tile_size_x;
+  }
+
+  if ((tile_offset_y + 1) * tile_size_y >= data_height) {
+    (*height) = data_height - (tile_offset_y * tile_size_y);
+  } else {
+    (*height) = tile_size_y;
+  }
+
+  // Image size = tile size.
+  return DecodePixelData(out_images, requested_pixel_types, data_ptr, data_len,
+                         compression_type, line_order, (*width), tile_size_y,
+                         /* stride */ tile_size_x, /* y */ 0, /* line_no */ 0,
+                         (*height), pixel_data_size, num_attributes, attributes,
+                         num_channels, channels, channel_offset_list);
+}
+
+static bool ComputeChannelLayout(std::vector<size_t> *channel_offset_list,
+                                 int *pixel_data_size, size_t *channel_offset,
+                                 int num_channels,
+                                 const EXRChannelInfo *channels) {
+  channel_offset_list->resize(static_cast<size_t>(num_channels));
+
+  (*pixel_data_size) = 0;
+  (*channel_offset) = 0;
+
+  for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+    (*channel_offset_list)[c] = (*channel_offset);
+    if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      (*pixel_data_size) += sizeof(unsigned short);
+      (*channel_offset) += sizeof(unsigned short);
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+      (*pixel_data_size) += sizeof(float);
+      (*channel_offset) += sizeof(float);
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+      (*pixel_data_size) += sizeof(unsigned int);
+      (*channel_offset) += sizeof(unsigned int);
+    } else {
+      // ???
+      return false;
+    }
+  }
+  return true;
+}
+
+// TODO: Simply return nullptr when failed to allocate?
+static unsigned char **AllocateImage(int num_channels,
+                                     const EXRChannelInfo *channels,
+                                     const int *requested_pixel_types,
+                                     int data_width, int data_height, bool *success) {
+  unsigned char **images =
+      reinterpret_cast<unsigned char **>(static_cast<float **>(
+          malloc(sizeof(float *) * static_cast<size_t>(num_channels))));
+
+  for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+    images[c] = NULL;
+  }
+
+  bool valid = true;
+
+  for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+    size_t data_len =
+        static_cast<size_t>(data_width) * static_cast<size_t>(data_height);
+    if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      // pixel_data_size += sizeof(unsigned short);
+      // channel_offset += sizeof(unsigned short);
+      // Alloc internal image for half type.
+      if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+        images[c] =
+            reinterpret_cast<unsigned char *>(static_cast<unsigned short *>(
+                malloc(sizeof(unsigned short) * data_len)));
+      } else if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT) {
+        images[c] = reinterpret_cast<unsigned char *>(
+            static_cast<float *>(malloc(sizeof(float) * data_len)));
+      } else {
+        images[c] = NULL; // just in case.
+        valid = false;
+        break;
+      }
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+      // pixel_data_size += sizeof(float);
+      // channel_offset += sizeof(float);
+      images[c] = reinterpret_cast<unsigned char *>(
+          static_cast<float *>(malloc(sizeof(float) * data_len)));
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+      // pixel_data_size += sizeof(unsigned int);
+      // channel_offset += sizeof(unsigned int);
+      images[c] = reinterpret_cast<unsigned char *>(
+          static_cast<unsigned int *>(malloc(sizeof(unsigned int) * data_len)));
+    } else {
+      images[c] = NULL; // just in case.
+      valid = false;
+      break;
+    }
+  }
+
+  if (!valid) {
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (images[c]) {
+        free(images[c]);
+        images[c] = NULL;
+      }
+    }
+
+    if (success) {
+      (*success) = false;
+    }
+  } else {
+    if (success) {
+      (*success) = true;
+    }
+  }
+
+  return images;
+}
+
+#ifdef _WIN32
+static inline std::wstring UTF8ToWchar(const std::string &str) {
+  int wstr_size =
+      MultiByteToWideChar(CP_UTF8, 0, str.data(), (int)str.size(), NULL, 0);
+  std::wstring wstr(wstr_size, 0);
+  MultiByteToWideChar(CP_UTF8, 0, str.data(), (int)str.size(), &wstr[0],
+                      (int)wstr.size());
+  return wstr;
+}
+#endif
+
+
+static int ParseEXRHeader(HeaderInfo *info, bool *empty_header,
+                          const EXRVersion *version, std::string *err,
+                          const unsigned char *buf, size_t size) {
+  const char *marker = reinterpret_cast<const char *>(&buf[0]);
+
+  if (empty_header) {
+    (*empty_header) = false;
+  }
+
+  if (version->multipart) {
+    if (size > 0 && marker[0] == '\0') {
+      // End of header list.
+      if (empty_header) {
+        (*empty_header) = true;
+      }
+      return TINYEXR_SUCCESS;
+    }
+  }
+
+  // According to the spec, the header of every OpenEXR file must contain at
+  // least the following attributes:
+  //
+  // channels chlist
+  // compression compression
+  // dataWindow box2i
+  // displayWindow box2i
+  // lineOrder lineOrder
+  // pixelAspectRatio float
+  // screenWindowCenter v2f
+  // screenWindowWidth float
+  bool has_channels = false;
+  bool has_compression = false;
+  bool has_data_window = false;
+  bool has_display_window = false;
+  bool has_line_order = false;
+  bool has_pixel_aspect_ratio = false;
+  bool has_screen_window_center = false;
+  bool has_screen_window_width = false;
+  bool has_name = false;
+  bool has_type = false;
+
+  info->name.clear();
+  info->type.clear();
+
+  info->data_window.min_x = 0;
+  info->data_window.min_y = 0;
+  info->data_window.max_x = 0;
+  info->data_window.max_y = 0;
+  info->line_order = 0;  // @fixme
+  info->display_window.min_x = 0;
+  info->display_window.min_y = 0;
+  info->display_window.max_x = 0;
+  info->display_window.max_y = 0;
+  info->screen_window_center[0] = 0.0f;
+  info->screen_window_center[1] = 0.0f;
+  info->screen_window_width = -1.0f;
+  info->pixel_aspect_ratio = -1.0f;
+
+  info->tiled = 0;
+  info->tile_size_x = -1;
+  info->tile_size_y = -1;
+  info->tile_level_mode = -1;
+  info->tile_rounding_mode = -1;
+
+  info->attributes.clear();
+
+  // Read attributes
+  size_t orig_size = size;
+  for (size_t nattr = 0; nattr < TINYEXR_MAX_HEADER_ATTRIBUTES; nattr++) {
+    if (0 == size) {
+      if (err) {
+        (*err) += "Insufficient data size for attributes.\n";
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    } else if (marker[0] == '\0') {
+      size--;
+      break;
+    }
+
+    std::string attr_name;
+    std::string attr_type;
+    std::vector<unsigned char> data;
+    size_t marker_size;
+    if (!tinyexr::ReadAttribute(&attr_name, &attr_type, &data, &marker_size,
+                                marker, size)) {
+      if (err) {
+        (*err) += "Failed to read attribute.\n";
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    marker += marker_size;
+    size -= marker_size;
+
+    // For a multipart file, the version field 9th bit is 0.
+    if ((version->tiled || version->multipart || version->non_image) && attr_name.compare("tiles") == 0) {
+      unsigned int x_size, y_size;
+      unsigned char tile_mode;
+      if (data.size() != 9) {
+        if (err) {
+          (*err) += "(ParseEXRHeader) Invalid attribute data size. Attribute data size must be 9.\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      memcpy(&x_size, &data.at(0), sizeof(int));
+      memcpy(&y_size, &data.at(4), sizeof(int));
+      tile_mode = data[8];
+      tinyexr::swap4(&x_size);
+      tinyexr::swap4(&y_size);
+
+      if (x_size > static_cast<unsigned int>(std::numeric_limits<int>::max()) ||
+          y_size > static_cast<unsigned int>(std::numeric_limits<int>::max())) {
+        if (err) {
+          (*err) = "Tile sizes were invalid.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+      }
+
+      info->tile_size_x = static_cast<int>(x_size);
+      info->tile_size_y = static_cast<int>(y_size);
+
+      // mode = levelMode + roundingMode * 16
+      info->tile_level_mode = tile_mode & 0x3;
+      info->tile_rounding_mode = (tile_mode >> 4) & 0x1;
+      info->tiled = 1;
+    } else if (attr_name.compare("compression") == 0) {
+      bool ok = false;
+      if (data[0] < TINYEXR_COMPRESSIONTYPE_PIZ) {
+        ok = true;
+      }
+
+      if (data[0] == TINYEXR_COMPRESSIONTYPE_PIZ) {
+#if TINYEXR_USE_PIZ
+        ok = true;
+#else
+        if (err) {
+          (*err) = "PIZ compression is not supported.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+#endif
+      }
+
+      if (data[0] == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if TINYEXR_USE_ZFP
+        ok = true;
+#else
+        if (err) {
+          (*err) = "ZFP compression is not supported.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+#endif
+      }
+
+      if (!ok) {
+        if (err) {
+          (*err) = "Unknown compression type.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+      }
+
+      info->compression_type = static_cast<int>(data[0]);
+      has_compression = true;
+
+    } else if (attr_name.compare("channels") == 0) {
+      // name: zero-terminated string, from 1 to 255 bytes long
+      // pixel type: int, possible values are: UINT = 0 HALF = 1 FLOAT = 2
+      // pLinear: unsigned char, possible values are 0 and 1
+      // reserved: three chars, should be zero
+      // xSampling: int
+      // ySampling: int
+
+      if (!ReadChannelInfo(info->channels, data)) {
+        if (err) {
+          (*err) += "Failed to parse channel info.\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      if (info->channels.size() < 1) {
+        if (err) {
+          (*err) += "# of channels is zero.\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      has_channels = true;
+
+    } else if (attr_name.compare("dataWindow") == 0) {
+      if (data.size() >= 16) {
+        memcpy(&info->data_window.min_x, &data.at(0), sizeof(int));
+        memcpy(&info->data_window.min_y, &data.at(4), sizeof(int));
+        memcpy(&info->data_window.max_x, &data.at(8), sizeof(int));
+        memcpy(&info->data_window.max_y, &data.at(12), sizeof(int));
+        tinyexr::swap4(&info->data_window.min_x);
+        tinyexr::swap4(&info->data_window.min_y);
+        tinyexr::swap4(&info->data_window.max_x);
+        tinyexr::swap4(&info->data_window.max_y);
+        has_data_window = true;
+      }
+    } else if (attr_name.compare("displayWindow") == 0) {
+      if (data.size() >= 16) {
+        memcpy(&info->display_window.min_x, &data.at(0), sizeof(int));
+        memcpy(&info->display_window.min_y, &data.at(4), sizeof(int));
+        memcpy(&info->display_window.max_x, &data.at(8), sizeof(int));
+        memcpy(&info->display_window.max_y, &data.at(12), sizeof(int));
+        tinyexr::swap4(&info->display_window.min_x);
+        tinyexr::swap4(&info->display_window.min_y);
+        tinyexr::swap4(&info->display_window.max_x);
+        tinyexr::swap4(&info->display_window.max_y);
+
+        has_display_window = true;
+      }
+    } else if (attr_name.compare("lineOrder") == 0) {
+      if (data.size() >= 1) {
+        info->line_order = static_cast<int>(data[0]);
+        has_line_order = true;
+      }
+    } else if (attr_name.compare("pixelAspectRatio") == 0) {
+      if (data.size() >= sizeof(float)) {
+        memcpy(&info->pixel_aspect_ratio, &data.at(0), sizeof(float));
+        tinyexr::swap4(&info->pixel_aspect_ratio);
+        has_pixel_aspect_ratio = true;
+      }
+    } else if (attr_name.compare("screenWindowCenter") == 0) {
+      if (data.size() >= 8) {
+        memcpy(&info->screen_window_center[0], &data.at(0), sizeof(float));
+        memcpy(&info->screen_window_center[1], &data.at(4), sizeof(float));
+        tinyexr::swap4(&info->screen_window_center[0]);
+        tinyexr::swap4(&info->screen_window_center[1]);
+        has_screen_window_center = true;
+      }
+    } else if (attr_name.compare("screenWindowWidth") == 0) {
+      if (data.size() >= sizeof(float)) {
+        memcpy(&info->screen_window_width, &data.at(0), sizeof(float));
+        tinyexr::swap4(&info->screen_window_width);
+
+        has_screen_window_width = true;
+      }
+    } else if (attr_name.compare("chunkCount") == 0) {
+      if (data.size() >= sizeof(int)) {
+        memcpy(&info->chunk_count, &data.at(0), sizeof(int));
+        tinyexr::swap4(&info->chunk_count);
+      }
+    } else if (attr_name.compare("name") == 0) {
+      if (!data.empty() && data[0]) {
+        data.push_back(0);
+        size_t len = strlen(reinterpret_cast<const char*>(&data[0]));
+        info->name.resize(len);
+        info->name.assign(reinterpret_cast<const char*>(&data[0]), len);
+        has_name = true;
+      }
+    } else if (attr_name.compare("type") == 0) {
+      if (!data.empty() && data[0]) {
+        data.push_back(0);
+        size_t len = strlen(reinterpret_cast<const char*>(&data[0]));
+        info->type.resize(len);
+        info->type.assign(reinterpret_cast<const char*>(&data[0]), len);
+        has_type = true;
+      }
+    } else {
+      // Custom attribute(up to TINYEXR_MAX_CUSTOM_ATTRIBUTES)
+      if (info->attributes.size() < TINYEXR_MAX_CUSTOM_ATTRIBUTES) {
+        EXRAttribute attrib;
+#ifdef _MSC_VER
+        strncpy_s(attrib.name, attr_name.c_str(), 255);
+        strncpy_s(attrib.type, attr_type.c_str(), 255);
+#else
+        strncpy(attrib.name, attr_name.c_str(), 255);
+        strncpy(attrib.type, attr_type.c_str(), 255);
+#endif
+        attrib.name[255] = '\0';
+        attrib.type[255] = '\0';
+        //std::cout << "i = " << info->attributes.size() << ", dsize = " << data.size() << "\n";
+        attrib.size = static_cast<int>(data.size());
+        attrib.value = static_cast<unsigned char *>(malloc(data.size()));
+        memcpy(reinterpret_cast<char *>(attrib.value), &data.at(0),
+               data.size());
+        info->attributes.push_back(attrib);
+      }
+    }
+  }
+
+  // Check if required attributes exist
+  {
+    std::stringstream ss_err;
+
+    if (!has_compression) {
+      ss_err << "\"compression\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_channels) {
+      ss_err << "\"channels\" attribute not found in the header." << std::endl;
+    }
+
+    if (!has_line_order) {
+      ss_err << "\"lineOrder\" attribute not found in the header." << std::endl;
+    }
+
+    if (!has_display_window) {
+      ss_err << "\"displayWindow\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_data_window) {
+      ss_err << "\"dataWindow\" attribute not found in the header or invalid."
+             << std::endl;
+    }
+
+    if (!has_pixel_aspect_ratio) {
+      ss_err << "\"pixelAspectRatio\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_screen_window_width) {
+      ss_err << "\"screenWindowWidth\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_screen_window_center) {
+      ss_err << "\"screenWindowCenter\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (version->multipart || version->non_image) {
+      if (!has_name) {
+        ss_err << "\"name\" attribute not found in the header."
+          << std::endl;
+      }
+      if (!has_type) {
+        ss_err << "\"type\" attribute not found in the header."
+          << std::endl;
+      }
+    }
+
+    if (!(ss_err.str().empty())) {
+      if (err) {
+        (*err) += ss_err.str();
+      }
+
+      return TINYEXR_ERROR_INVALID_HEADER;
+    }
+  }
+
+  info->header_len = static_cast<unsigned int>(orig_size - size);
+
+  return TINYEXR_SUCCESS;
+}
+
+// C++ HeaderInfo to C EXRHeader conversion.
+static bool ConvertHeader(EXRHeader *exr_header, const HeaderInfo &info, std::string *warn, std::string *err) {
+  exr_header->pixel_aspect_ratio = info.pixel_aspect_ratio;
+  exr_header->screen_window_center[0] = info.screen_window_center[0];
+  exr_header->screen_window_center[1] = info.screen_window_center[1];
+  exr_header->screen_window_width = info.screen_window_width;
+  exr_header->chunk_count = info.chunk_count;
+  exr_header->display_window.min_x = info.display_window.min_x;
+  exr_header->display_window.min_y = info.display_window.min_y;
+  exr_header->display_window.max_x = info.display_window.max_x;
+  exr_header->display_window.max_y = info.display_window.max_y;
+  exr_header->data_window.min_x = info.data_window.min_x;
+  exr_header->data_window.min_y = info.data_window.min_y;
+  exr_header->data_window.max_x = info.data_window.max_x;
+  exr_header->data_window.max_y = info.data_window.max_y;
+  exr_header->line_order = info.line_order;
+  exr_header->compression_type = info.compression_type;
+  exr_header->tiled = info.tiled;
+  exr_header->tile_size_x = info.tile_size_x;
+  exr_header->tile_size_y = info.tile_size_y;
+  exr_header->tile_level_mode = info.tile_level_mode;
+  exr_header->tile_rounding_mode = info.tile_rounding_mode;
+
+  EXRSetNameAttr(exr_header, info.name.c_str());
+
+
+  if (!info.type.empty()) {
+    bool valid = true;
+    if (info.type == "scanlineimage") {
+      if (exr_header->tiled) {
+        if (err) {
+          (*err) += "(ConvertHeader) tiled bit must be off for `scanlineimage` type.\n";
+        }
+        valid = false;
+      }
+    } else if (info.type == "tiledimage") {
+      if (!exr_header->tiled) {
+        if (err) {
+          (*err) += "(ConvertHeader) tiled bit must be on for `tiledimage` type.\n";
+        }
+        valid = false;
+      }
+    } else if (info.type == "deeptile") {
+      exr_header->non_image = 1;
+      if (!exr_header->tiled) {
+        if (err) {
+          (*err) += "(ConvertHeader) tiled bit must be on for `deeptile` type.\n";
+        }
+        valid = false;
+      }
+    } else if (info.type == "deepscanline") {
+      exr_header->non_image = 1;
+      if (exr_header->tiled) {
+        if (err) {
+          (*err) += "(ConvertHeader) tiled bit must be off for `deepscanline` type.\n";
+        }
+        //valid = false;
+      }
+    } else {
+      if (warn) {
+        std::stringstream ss;
+        ss << "(ConvertHeader) Unsupported or unknown info.type: " << info.type << "\n";
+        (*warn) += ss.str();
+      }
+    }
+
+    if (!valid) {
+      return false;
+    }
+  }
+
+  exr_header->num_channels = static_cast<int>(info.channels.size());
+
+  exr_header->channels = static_cast<EXRChannelInfo *>(malloc(
+      sizeof(EXRChannelInfo) * static_cast<size_t>(exr_header->num_channels)));
+  for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+#ifdef _MSC_VER
+    strncpy_s(exr_header->channels[c].name, info.channels[c].name.c_str(), 255);
+#else
+    strncpy(exr_header->channels[c].name, info.channels[c].name.c_str(), 255);
+#endif
+    // manually add '\0' for safety.
+    exr_header->channels[c].name[255] = '\0';
+
+    exr_header->channels[c].pixel_type = info.channels[c].pixel_type;
+    exr_header->channels[c].p_linear = info.channels[c].p_linear;
+    exr_header->channels[c].x_sampling = info.channels[c].x_sampling;
+    exr_header->channels[c].y_sampling = info.channels[c].y_sampling;
+  }
+
+  exr_header->pixel_types = static_cast<int *>(
+      malloc(sizeof(int) * static_cast<size_t>(exr_header->num_channels)));
+  for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+    exr_header->pixel_types[c] = info.channels[c].pixel_type;
+  }
+
+  // Initially fill with values of `pixel_types`
+  exr_header->requested_pixel_types = static_cast<int *>(
+      malloc(sizeof(int) * static_cast<size_t>(exr_header->num_channels)));
+  for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+    exr_header->requested_pixel_types[c] = info.channels[c].pixel_type;
+  }
+
+  exr_header->num_custom_attributes = static_cast<int>(info.attributes.size());
+
+  if (exr_header->num_custom_attributes > 0) {
+    // TODO(syoyo): Report warning when # of attributes exceeds
+    // `TINYEXR_MAX_CUSTOM_ATTRIBUTES`
+    if (exr_header->num_custom_attributes > TINYEXR_MAX_CUSTOM_ATTRIBUTES) {
+      exr_header->num_custom_attributes = TINYEXR_MAX_CUSTOM_ATTRIBUTES;
+    }
+
+    exr_header->custom_attributes = static_cast<EXRAttribute *>(malloc(
+        sizeof(EXRAttribute) * size_t(exr_header->num_custom_attributes)));
+
+    for (size_t i = 0; i < size_t(exr_header->num_custom_attributes); i++) {
+      memcpy(exr_header->custom_attributes[i].name, info.attributes[i].name,
+             256);
+      memcpy(exr_header->custom_attributes[i].type, info.attributes[i].type,
+             256);
+      exr_header->custom_attributes[i].size = info.attributes[i].size;
+      // Just copy pointer
+      exr_header->custom_attributes[i].value = info.attributes[i].value;
+    }
+
+  } else {
+    exr_header->custom_attributes = NULL;
+  }
+
+  exr_header->header_len = info.header_len;
+
+  return true;
+}
+
+struct OffsetData {
+  OffsetData() : num_x_levels(0), num_y_levels(0) {}
+  std::vector<std::vector<std::vector <tinyexr::tinyexr_uint64> > > offsets;
+  int num_x_levels;
+  int num_y_levels;
+};
+
+// -1 = error
+static int LevelIndex(int lx, int ly, int tile_level_mode, int num_x_levels) {
+  switch (tile_level_mode) {
+  case TINYEXR_TILE_ONE_LEVEL:
+    return 0;
+
+  case TINYEXR_TILE_MIPMAP_LEVELS:
+    return lx;
+
+  case TINYEXR_TILE_RIPMAP_LEVELS:
+    return lx + ly * num_x_levels;
+
+  default:
+    return -1;
+  }
+//  return 0;
+}
+
+static int LevelSize(int toplevel_size, int level, int tile_rounding_mode) {
+  if (level < 0) {
+    return -1;
+  }
+
+  int b = static_cast<int>(1u << static_cast<unsigned int>(level));
+  int level_size = toplevel_size / b;
+
+  if (tile_rounding_mode == TINYEXR_TILE_ROUND_UP && level_size * b < toplevel_size)
+    level_size += 1;
+
+  return std::max(level_size, 1);
+}
+
+static int DecodeTiledLevel(EXRImage* exr_image, const EXRHeader* exr_header,
+  const OffsetData& offset_data,
+  const std::vector<size_t>& channel_offset_list,
+  int pixel_data_size,
+  const unsigned char* head, const size_t size,
+  std::string* err) {
+  int num_channels = exr_header->num_channels;
+
+  int level_index = LevelIndex(exr_image->level_x, exr_image->level_y, exr_header->tile_level_mode, offset_data.num_x_levels);
+  int num_y_tiles = int(offset_data.offsets[size_t(level_index)].size());
+  if (num_y_tiles < 1) {
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+  int num_x_tiles = int(offset_data.offsets[size_t(level_index)][0].size());
+  if (num_x_tiles < 1) {
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+  int num_tiles = num_x_tiles * num_y_tiles;
+
+  int err_code = TINYEXR_SUCCESS;
+
+  enum {
+    EF_SUCCESS = 0,
+    EF_INVALID_DATA = 1,
+    EF_INSUFFICIENT_DATA = 2,
+    EF_FAILED_TO_DECODE = 4
+  };
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  std::atomic<unsigned> error_flag(EF_SUCCESS);
+#else
+  unsigned error_flag(EF_SUCCESS);
+#endif
+
+  // Although the spec says : "...the data window is subdivided into an array of smaller rectangles...",
+  // the IlmImf library allows the dimensions of the tile to be larger (or equal) than the dimensions of the data window.
+#if 0
+  if ((exr_header->tile_size_x > exr_image->width || exr_header->tile_size_y > exr_image->height) &&
+    exr_image->level_x == 0 && exr_image->level_y == 0) {
+    if (err) {
+      (*err) += "Failed to decode tile data.\n";
+    }
+    err_code = TINYEXR_ERROR_INVALID_DATA;
+  }
+#endif
+  exr_image->tiles = static_cast<EXRTile*>(
+    calloc(sizeof(EXRTile), static_cast<size_t>(num_tiles)));
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  std::vector<std::thread> workers;
+  std::atomic<int> tile_count(0);
+
+  int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
+  if (num_threads > int(num_tiles)) {
+    num_threads = int(num_tiles);
+  }
+
+  for (int t = 0; t < num_threads; t++) {
+    workers.emplace_back(std::thread([&]()
+      {
+        int tile_idx = 0;
+        while ((tile_idx = tile_count++) < num_tiles) {
+
+#else
+#if TINYEXR_USE_OPENMP
+#pragma omp parallel for
+#endif
+  for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+#endif
+    // Allocate memory for each tile.
+    bool alloc_success = false;
+    exr_image->tiles[tile_idx].images = tinyexr::AllocateImage(
+      num_channels, exr_header->channels,
+      exr_header->requested_pixel_types, exr_header->tile_size_x,
+      exr_header->tile_size_y, &alloc_success);
+
+    if (!alloc_success) {
+      error_flag |= EF_INVALID_DATA;
+      continue;
+    }
+
+    int x_tile = tile_idx % num_x_tiles;
+    int y_tile = tile_idx / num_x_tiles;
+    // 16 byte: tile coordinates
+    // 4 byte : data size
+    // ~      : data(uncompressed or compressed)
+    tinyexr::tinyexr_uint64 offset = offset_data.offsets[size_t(level_index)][size_t(y_tile)][size_t(x_tile)];
+    if (offset + sizeof(int) * 5 > size) {
+      // Insufficient data size.
+      error_flag |= EF_INSUFFICIENT_DATA;
+      continue;
+    }
+
+    size_t data_size =
+      size_t(size - (offset + sizeof(int) * 5));
+    const unsigned char* data_ptr =
+      reinterpret_cast<const unsigned char*>(head + offset);
+
+    int tile_coordinates[4];
+    memcpy(tile_coordinates, data_ptr, sizeof(int) * 4);
+    tinyexr::swap4(&tile_coordinates[0]);
+    tinyexr::swap4(&tile_coordinates[1]);
+    tinyexr::swap4(&tile_coordinates[2]);
+    tinyexr::swap4(&tile_coordinates[3]);
+
+    if (tile_coordinates[2] != exr_image->level_x) {
+      // Invalid data.
+      error_flag |= EF_INVALID_DATA;
+      continue;
+    }
+    if (tile_coordinates[3] != exr_image->level_y) {
+      // Invalid data.
+      error_flag |= EF_INVALID_DATA;
+      continue;
+    }
+
+    int data_len;
+    memcpy(&data_len, data_ptr + 16,
+      sizeof(int));  // 16 = sizeof(tile_coordinates)
+    tinyexr::swap4(&data_len);
+
+    if (data_len < 2 || size_t(data_len) > data_size) {
+      // Insufficient data size.
+      error_flag |= EF_INSUFFICIENT_DATA;
+      continue;
+    }
+
+    // Move to data addr: 20 = 16 + 4;
+    data_ptr += 20;
+    bool ret = tinyexr::DecodeTiledPixelData(
+      exr_image->tiles[tile_idx].images,
+      &(exr_image->tiles[tile_idx].width),
+      &(exr_image->tiles[tile_idx].height),
+      exr_header->requested_pixel_types, data_ptr,
+      static_cast<size_t>(data_len), exr_header->compression_type,
+      exr_header->line_order,
+      exr_image->width, exr_image->height,
+      tile_coordinates[0], tile_coordinates[1], exr_header->tile_size_x,
+      exr_header->tile_size_y, static_cast<size_t>(pixel_data_size),
+      static_cast<size_t>(exr_header->num_custom_attributes),
+      exr_header->custom_attributes,
+      static_cast<size_t>(exr_header->num_channels),
+      exr_header->channels, channel_offset_list);
+
+    if (!ret) {
+      // Failed to decode tile data.
+      error_flag |= EF_FAILED_TO_DECODE;
+    }
+
+    exr_image->tiles[tile_idx].offset_x = tile_coordinates[0];
+    exr_image->tiles[tile_idx].offset_y = tile_coordinates[1];
+    exr_image->tiles[tile_idx].level_x = tile_coordinates[2];
+    exr_image->tiles[tile_idx].level_y = tile_coordinates[3];
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  }
+        }));
+    }  // num_thread loop
+
+    for (auto& t : workers) {
+      t.join();
+    }
+
+#else
+  } // parallel for
+#endif
+
+  // Even in the event of an error, the reserved memory may be freed.
+  exr_image->num_channels = num_channels;
+  exr_image->num_tiles = static_cast<int>(num_tiles);
+
+  if (error_flag)  err_code = TINYEXR_ERROR_INVALID_DATA;
+  if (err) {
+    if (error_flag & EF_INSUFFICIENT_DATA) {
+      (*err) += "Insufficient data length.\n";
+    }
+    if (error_flag & EF_FAILED_TO_DECODE) {
+      (*err) += "Failed to decode tile data.\n";
+    }
+  }
+  return err_code;
+}
+
+static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
+                       const OffsetData& offset_data,
+                       const unsigned char *head, const size_t size,
+                       std::string *err) {
+  int num_channels = exr_header->num_channels;
+
+  int num_scanline_blocks = 1;
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    num_scanline_blocks = 16;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    num_scanline_blocks = 32;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    num_scanline_blocks = 16;
+
+#if TINYEXR_USE_ZFP
+    tinyexr::ZFPCompressionParam zfp_compression_param;
+    if (!FindZFPCompressionParam(&zfp_compression_param,
+                                 exr_header->custom_attributes,
+                                 int(exr_header->num_custom_attributes), err)) {
+      return TINYEXR_ERROR_INVALID_HEADER;
+    }
+#endif
+  }
+
+  if (exr_header->data_window.max_x < exr_header->data_window.min_x ||
+      exr_header->data_window.max_y < exr_header->data_window.min_y) {
+    if (err) {
+      (*err) += "Invalid data window.\n";
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  tinyexr_int64 data_width =
+      static_cast<tinyexr_int64>(exr_header->data_window.max_x) - static_cast<tinyexr_int64>(exr_header->data_window.min_x) + static_cast<tinyexr_int64>(1);
+  tinyexr_int64 data_height =
+      static_cast<tinyexr_int64>(exr_header->data_window.max_y) - static_cast<tinyexr_int64>(exr_header->data_window.min_y) + static_cast<tinyexr_int64>(1);
+
+  if (data_width <= 0) {
+    if (err) {
+      (*err) += "Invalid data window width.\n";
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  if (data_height <= 0) {
+    if (err) {
+      (*err) += "Invalid data window height.\n";
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  // Do not allow too large data_width and data_height. header invalid?
+  {
+    if ((data_width > TINYEXR_DIMENSION_THRESHOLD) || (data_height > TINYEXR_DIMENSION_THRESHOLD)) {
+      if (err) {
+        std::stringstream ss;
+        ss << "data_with or data_height too large. data_width: " << data_width
+           << ", "
+           << "data_height = " << data_height << std::endl;
+        (*err) += ss.str();
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    if (exr_header->tiled) {
+      if ((exr_header->tile_size_x > TINYEXR_DIMENSION_THRESHOLD) || (exr_header->tile_size_y > TINYEXR_DIMENSION_THRESHOLD)) {
+        if (err) {
+          std::stringstream ss;
+          ss << "tile with or tile height too large. tile width: " << exr_header->tile_size_x
+            << ", "
+            << "tile height = " << exr_header->tile_size_y << std::endl;
+          (*err) += ss.str();
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+    }
+  }
+
+  const std::vector<tinyexr::tinyexr_uint64>& offsets = offset_data.offsets[0][0];
+  size_t num_blocks = offsets.size();
+
+  std::vector<size_t> channel_offset_list;
+  int pixel_data_size = 0;
+  size_t channel_offset = 0;
+  if (!tinyexr::ComputeChannelLayout(&channel_offset_list, &pixel_data_size,
+                                     &channel_offset, num_channels,
+                                     exr_header->channels)) {
+    if (err) {
+      (*err) += "Failed to compute channel layout.\n";
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  std::atomic<bool> invalid_data(false);
+#else
+  bool invalid_data(false);
+#endif
+
+  if (exr_header->tiled) {
+    // value check
+    if (exr_header->tile_size_x < 0) {
+      if (err) {
+        std::stringstream ss;
+        ss << "Invalid tile size x : " << exr_header->tile_size_x << "\n";
+        (*err) += ss.str();
+      }
+      return TINYEXR_ERROR_INVALID_HEADER;
+    }
+
+    if (exr_header->tile_size_y < 0) {
+      if (err) {
+        std::stringstream ss;
+        ss << "Invalid tile size y : " << exr_header->tile_size_y << "\n";
+        (*err) += ss.str();
+      }
+      return TINYEXR_ERROR_INVALID_HEADER;
+    }
+    if (exr_header->tile_level_mode != TINYEXR_TILE_RIPMAP_LEVELS) {
+      EXRImage* level_image = NULL;
+      for (int level = 0; level < offset_data.num_x_levels; ++level) {
+        if (!level_image) {
+          level_image = exr_image;
+        } else {
+          level_image->next_level = new EXRImage;
+          InitEXRImage(level_image->next_level);
+          level_image = level_image->next_level;
+        }
+        level_image->width =
+          LevelSize(exr_header->data_window.max_x - exr_header->data_window.min_x + 1, level, exr_header->tile_rounding_mode);
+        if (level_image->width < 1) {
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+
+        level_image->height =
+          LevelSize(exr_header->data_window.max_y - exr_header->data_window.min_y + 1, level, exr_header->tile_rounding_mode);
+
+        if (level_image->height < 1) {
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+
+        level_image->level_x = level;
+        level_image->level_y = level;
+
+        int ret = DecodeTiledLevel(level_image, exr_header,
+          offset_data,
+          channel_offset_list,
+          pixel_data_size,
+          head, size,
+          err);
+        if (ret != TINYEXR_SUCCESS) return ret;
+      }
+    } else {
+      EXRImage* level_image = NULL;
+      for (int level_y = 0; level_y < offset_data.num_y_levels; ++level_y)
+        for (int level_x = 0; level_x < offset_data.num_x_levels; ++level_x) {
+          if (!level_image) {
+            level_image = exr_image;
+          } else {
+            level_image->next_level = new EXRImage;
+            InitEXRImage(level_image->next_level);
+            level_image = level_image->next_level;
+          }
+
+          level_image->width =
+            LevelSize(exr_header->data_window.max_x - exr_header->data_window.min_x + 1, level_x, exr_header->tile_rounding_mode);
+          if (level_image->width < 1) {
+            return TINYEXR_ERROR_INVALID_DATA;
+          }
+
+          level_image->height =
+            LevelSize(exr_header->data_window.max_y - exr_header->data_window.min_y + 1, level_y, exr_header->tile_rounding_mode);
+          if (level_image->height < 1) {
+            return TINYEXR_ERROR_INVALID_DATA;
+          }
+
+          level_image->level_x = level_x;
+          level_image->level_y = level_y;
+
+          int ret = DecodeTiledLevel(level_image, exr_header,
+            offset_data,
+            channel_offset_list,
+            pixel_data_size,
+            head, size,
+            err);
+          if (ret != TINYEXR_SUCCESS) return ret;
+        }
+    }
+  } else {  // scanline format
+    // Don't allow too large image(256GB * pixel_data_size or more). Workaround
+    // for #104.
+    size_t total_data_len =
+        size_t(data_width) * size_t(data_height) * size_t(num_channels);
+    const bool total_data_len_overflown =
+        sizeof(void *) == 8 ? (total_data_len >= 0x4000000000) : false;
+    if ((total_data_len == 0) || total_data_len_overflown) {
+      if (err) {
+        std::stringstream ss;
+        ss << "Image data size is zero or too large: width = " << data_width
+           << ", height = " << data_height << ", channels = " << num_channels
+           << std::endl;
+        (*err) += ss.str();
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    bool alloc_success = false;
+    exr_image->images = tinyexr::AllocateImage(
+        num_channels, exr_header->channels, exr_header->requested_pixel_types,
+        int(data_width), int(data_height), &alloc_success);
+
+    if (!alloc_success) {
+      if (err) {
+        std::stringstream ss;
+        ss << "Failed to allocate memory for Images. Maybe EXR header is corrupted or Image data size is too large: width = " << data_width
+           << ", height = " << data_height << ", channels = " << num_channels
+           << std::endl;
+        (*err) += ss.str();
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+    std::vector<std::thread> workers;
+    std::atomic<int> y_count(0);
+
+    int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
+    if (num_threads > int(num_blocks)) {
+      num_threads = int(num_blocks);
+    }
+
+    for (int t = 0; t < num_threads; t++) {
+      workers.emplace_back(std::thread([&]() {
+        int y = 0;
+        while ((y = y_count++) < int(num_blocks)) {
+
+#else
+
+#if TINYEXR_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int y = 0; y < static_cast<int>(num_blocks); y++) {
+
+#endif
+          size_t y_idx = static_cast<size_t>(y);
+
+          if (offsets[y_idx] + sizeof(int) * 2 > size) {
+            invalid_data = true;
+          } else {
+            // 4 byte: scan line
+            // 4 byte: data size
+            // ~     : pixel data(uncompressed or compressed)
+            size_t data_size =
+                size_t(size - (offsets[y_idx] + sizeof(int) * 2));
+            const unsigned char *data_ptr =
+                reinterpret_cast<const unsigned char *>(head + offsets[y_idx]);
+
+            int line_no;
+            memcpy(&line_no, data_ptr, sizeof(int));
+            int data_len;
+            memcpy(&data_len, data_ptr + 4, sizeof(int));
+            tinyexr::swap4(&line_no);
+            tinyexr::swap4(&data_len);
+
+            if (size_t(data_len) > data_size) {
+              invalid_data = true;
+
+            } else if ((line_no > (2 << 20)) || (line_no < -(2 << 20))) {
+              // Too large value. Assume this is invalid
+              // 2**20 = 1048576 = heuristic value.
+              invalid_data = true;
+            } else if (data_len == 0) {
+              // TODO(syoyo): May be ok to raise the threshold for example
+              // `data_len < 4`
+              invalid_data = true;
+            } else {
+              // line_no may be negative.
+              int end_line_no = (std::min)(line_no + num_scanline_blocks,
+                                           (exr_header->data_window.max_y + 1));
+
+              int num_lines = end_line_no - line_no;
+
+              if (num_lines <= 0) {
+                invalid_data = true;
+              } else {
+                // Move to data addr: 8 = 4 + 4;
+                data_ptr += 8;
+
+                // Adjust line_no with data_window.bmin.y
+
+                // overflow check
+                tinyexr_int64 lno =
+                    static_cast<tinyexr_int64>(line_no) -
+                    static_cast<tinyexr_int64>(exr_header->data_window.min_y);
+                if (lno > std::numeric_limits<int>::max()) {
+                  line_no = -1;  // invalid
+                } else if (lno < -std::numeric_limits<int>::max()) {
+                  line_no = -1;  // invalid
+                } else {
+                  line_no -= exr_header->data_window.min_y;
+                }
+
+                if (line_no < 0) {
+                  invalid_data = true;
+                } else {
+                  if (!tinyexr::DecodePixelData(
+                          exr_image->images, exr_header->requested_pixel_types,
+                          data_ptr, static_cast<size_t>(data_len),
+                          exr_header->compression_type, exr_header->line_order,
+                          int(data_width), int(data_height), int(data_width), y, line_no,
+                          num_lines, static_cast<size_t>(pixel_data_size),
+                          static_cast<size_t>(
+                              exr_header->num_custom_attributes),
+                          exr_header->custom_attributes,
+                          static_cast<size_t>(exr_header->num_channels),
+                          exr_header->channels, channel_offset_list)) {
+                    invalid_data = true;
+                  }
+                }
+              }
+            }
+          }
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+        }
+      }));
+    }
+
+    for (auto &t : workers) {
+      t.join();
+    }
+#else
+    }  // omp parallel
+#endif
+  }
+
+  if (invalid_data) {
+    if (err) {
+      (*err) += "Invalid/Corrupted data found when decoding pixels.\n";
+    }
+
+    // free alloced image.
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (exr_image->images[c]) {
+        free(exr_image->images[c]);
+        exr_image->images[c] = NULL;
+      }
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  // Overwrite `pixel_type` with `requested_pixel_type`.
+  {
+    for (int c = 0; c < exr_header->num_channels; c++) {
+      exr_header->pixel_types[c] = exr_header->requested_pixel_types[c];
+    }
+  }
+
+  {
+    exr_image->num_channels = num_channels;
+
+    exr_image->width = int(data_width);
+    exr_image->height = int(data_height);
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+static bool ReconstructLineOffsets(
+    std::vector<tinyexr::tinyexr_uint64> *offsets, size_t n,
+    const unsigned char *head, const unsigned char *marker, const size_t size) {
+  if (head >= marker) {
+    return false;
+  }
+  if (offsets->size() != n) {
+    return false;
+  }
+
+  for (size_t i = 0; i < n; i++) {
+    size_t offset = static_cast<size_t>(marker - head);
+    // Offset should not exceed whole EXR file/data size.
+    if ((offset + sizeof(tinyexr::tinyexr_uint64)) >= size) {
+      return false;
+    }
+
+    int y;
+    unsigned int data_len;
+
+    memcpy(&y, marker, sizeof(int));
+    memcpy(&data_len, marker + 4, sizeof(unsigned int));
+
+    if (data_len >= size) {
+      return false;
+    }
+
+    tinyexr::swap4(&y);
+    tinyexr::swap4(&data_len);
+
+    (*offsets)[i] = offset;
+
+    marker += data_len + 8;  // 8 = 4 bytes(y) + 4 bytes(data_len)
+  }
+
+  return true;
+}
+
+
+static int FloorLog2(unsigned x) {
+  //
+  // For x > 0, floorLog2(y) returns floor(log(x)/log(2)).
+  //
+  int y = 0;
+  while (x > 1) {
+    y += 1;
+    x >>= 1u;
+  }
+  return y;
+}
+
+
+static int CeilLog2(unsigned x) {
+  //
+  // For x > 0, ceilLog2(y) returns ceil(log(x)/log(2)).
+  //
+  int y = 0;
+  int r = 0;
+  while (x > 1) {
+    if (x & 1)
+      r = 1;
+
+    y += 1;
+    x >>= 1u;
+  }
+  return y + r;
+}
+
+static int RoundLog2(int x, int tile_rounding_mode) {
+  return (tile_rounding_mode == TINYEXR_TILE_ROUND_DOWN) ? FloorLog2(static_cast<unsigned>(x)) : CeilLog2(static_cast<unsigned>(x));
+}
+
+static int CalculateNumXLevels(const EXRHeader* exr_header) {
+  int min_x = exr_header->data_window.min_x;
+  int max_x = exr_header->data_window.max_x;
+  int min_y = exr_header->data_window.min_y;
+  int max_y = exr_header->data_window.max_y;
+
+  int num = 0;
+  switch (exr_header->tile_level_mode) {
+  case TINYEXR_TILE_ONE_LEVEL:
+
+    num = 1;
+    break;
+
+  case TINYEXR_TILE_MIPMAP_LEVELS:
+
+  {
+    int w = max_x - min_x + 1;
+    int h = max_y - min_y + 1;
+    num = RoundLog2(std::max(w, h), exr_header->tile_rounding_mode) + 1;
+  }
+  break;
+
+  case TINYEXR_TILE_RIPMAP_LEVELS:
+
+  {
+    int w = max_x - min_x + 1;
+    num = RoundLog2(w, exr_header->tile_rounding_mode) + 1;
+  }
+  break;
+
+  default:
+
+    return -1;
+  }
+
+  return num;
+}
+
+static int CalculateNumYLevels(const EXRHeader* exr_header) {
+  int min_x = exr_header->data_window.min_x;
+  int max_x = exr_header->data_window.max_x;
+  int min_y = exr_header->data_window.min_y;
+  int max_y = exr_header->data_window.max_y;
+  int num = 0;
+
+  switch (exr_header->tile_level_mode) {
+  case TINYEXR_TILE_ONE_LEVEL:
+
+    num = 1;
+    break;
+
+  case TINYEXR_TILE_MIPMAP_LEVELS:
+
+  {
+    int w = max_x - min_x + 1;
+    int h = max_y - min_y + 1;
+    num = RoundLog2(std::max(w, h), exr_header->tile_rounding_mode) + 1;
+  }
+  break;
+
+  case TINYEXR_TILE_RIPMAP_LEVELS:
+
+  {
+    int h = max_y - min_y + 1;
+    num = RoundLog2(h, exr_header->tile_rounding_mode) + 1;
+  }
+  break;
+
+  default:
+
+    return -1;
+  }
+
+  return num;
+}
+
+static bool CalculateNumTiles(std::vector<int>& numTiles,
+  int toplevel_size,
+  int size,
+  int tile_rounding_mode) {
+  for (unsigned i = 0; i < numTiles.size(); i++) {
+    int l = LevelSize(toplevel_size, int(i), tile_rounding_mode);
+    if (l < 0) {
+      return false;
+    }
+    TINYEXR_CHECK_AND_RETURN_C(l <= std::numeric_limits<int>::max() - size + 1, false);
+
+    numTiles[i] = (l + size - 1) / size;
+  }
+  return true;
+}
+
+static bool PrecalculateTileInfo(std::vector<int>& num_x_tiles,
+  std::vector<int>& num_y_tiles,
+  const EXRHeader* exr_header) {
+  int min_x = exr_header->data_window.min_x;
+  int max_x = exr_header->data_window.max_x;
+  int min_y = exr_header->data_window.min_y;
+  int max_y = exr_header->data_window.max_y;
+
+  int num_x_levels = CalculateNumXLevels(exr_header);
+
+  if (num_x_levels < 0) {
+    return false;
+  }
+
+  int num_y_levels = CalculateNumYLevels(exr_header);
+
+  if (num_y_levels < 0) {
+    return false;
+  }
+
+  num_x_tiles.resize(size_t(num_x_levels));
+  num_y_tiles.resize(size_t(num_y_levels));
+
+  if (!CalculateNumTiles(num_x_tiles,
+    max_x - min_x + 1,
+    exr_header->tile_size_x,
+    exr_header->tile_rounding_mode)) {
+    return false;
+  }
+
+  if (!CalculateNumTiles(num_y_tiles,
+    max_y - min_y + 1,
+    exr_header->tile_size_y,
+    exr_header->tile_rounding_mode)) {
+    return false;
+  }
+
+  return true;
+}
+
+static void InitSingleResolutionOffsets(OffsetData& offset_data, size_t num_blocks) {
+  offset_data.offsets.resize(1);
+  offset_data.offsets[0].resize(1);
+  offset_data.offsets[0][0].resize(num_blocks);
+  offset_data.num_x_levels = 1;
+  offset_data.num_y_levels = 1;
+}
+
+// Return sum of tile blocks.
+// 0 = error
+static int InitTileOffsets(OffsetData& offset_data,
+  const EXRHeader* exr_header,
+  const std::vector<int>& num_x_tiles,
+  const std::vector<int>& num_y_tiles) {
+  int num_tile_blocks = 0;
+  offset_data.num_x_levels = static_cast<int>(num_x_tiles.size());
+  offset_data.num_y_levels = static_cast<int>(num_y_tiles.size());
+  switch (exr_header->tile_level_mode) {
+  case TINYEXR_TILE_ONE_LEVEL:
+  case TINYEXR_TILE_MIPMAP_LEVELS:
+    TINYEXR_CHECK_AND_RETURN_C(offset_data.num_x_levels == offset_data.num_y_levels, 0);
+    offset_data.offsets.resize(size_t(offset_data.num_x_levels));
+
+    for (unsigned int l = 0; l < offset_data.offsets.size(); ++l) {
+      offset_data.offsets[l].resize(size_t(num_y_tiles[l]));
+
+      for (unsigned int dy = 0; dy < offset_data.offsets[l].size(); ++dy) {
+        offset_data.offsets[l][dy].resize(size_t(num_x_tiles[l]));
+        num_tile_blocks += num_x_tiles[l];
+      }
+    }
+    break;
+
+  case TINYEXR_TILE_RIPMAP_LEVELS:
+
+    offset_data.offsets.resize(static_cast<size_t>(offset_data.num_x_levels) * static_cast<size_t>(offset_data.num_y_levels));
+
+    for (int ly = 0; ly < offset_data.num_y_levels; ++ly) {
+      for (int lx = 0; lx < offset_data.num_x_levels; ++lx) {
+        int l = ly * offset_data.num_x_levels + lx;
+        offset_data.offsets[size_t(l)].resize(size_t(num_y_tiles[size_t(ly)]));
+
+        for (size_t dy = 0; dy < offset_data.offsets[size_t(l)].size(); ++dy) {
+          offset_data.offsets[size_t(l)][dy].resize(size_t(num_x_tiles[size_t(lx)]));
+          num_tile_blocks += num_x_tiles[size_t(lx)];
+        }
+      }
+    }
+    break;
+
+  default:
+    return 0;
+  }
+  return num_tile_blocks;
+}
+
+static bool IsAnyOffsetsAreInvalid(const OffsetData& offset_data) {
+  for (unsigned int l = 0; l < offset_data.offsets.size(); ++l)
+    for (unsigned int dy = 0; dy < offset_data.offsets[l].size(); ++dy)
+      for (unsigned int dx = 0; dx < offset_data.offsets[l][dy].size(); ++dx)
+        if (reinterpret_cast<const tinyexr::tinyexr_int64&>(offset_data.offsets[l][dy][dx]) <= 0)
+          return true;
+
+  return false;
+}
+
+static bool isValidTile(const EXRHeader* exr_header,
+                        const OffsetData& offset_data,
+                        int dx, int dy, int lx, int ly) {
+  if (lx < 0 || ly < 0 || dx < 0 || dy < 0) return false;
+  int num_x_levels = offset_data.num_x_levels;
+  int num_y_levels = offset_data.num_y_levels;
+  switch (exr_header->tile_level_mode) {
+  case TINYEXR_TILE_ONE_LEVEL:
+
+    if (lx == 0 &&
+        ly == 0 &&
+        offset_data.offsets.size() > 0 &&
+        offset_data.offsets[0].size() > static_cast<size_t>(dy) &&
+        offset_data.offsets[0][size_t(dy)].size() > static_cast<size_t>(dx)) {
+      return true;
+    }
+
+    break;
+
+  case TINYEXR_TILE_MIPMAP_LEVELS:
+
+    if (lx < num_x_levels &&
+        ly < num_y_levels &&
+        offset_data.offsets.size() > static_cast<size_t>(lx) &&
+        offset_data.offsets[size_t(lx)].size() > static_cast<size_t>(dy) &&
+        offset_data.offsets[size_t(lx)][size_t(dy)].size() > static_cast<size_t>(dx)) {
+      return true;
+    }
+
+    break;
+
+  case TINYEXR_TILE_RIPMAP_LEVELS:
+  {
+    size_t idx = static_cast<size_t>(lx) + static_cast<size_t>(ly)* static_cast<size_t>(num_x_levels);
+    if (lx < num_x_levels &&
+       ly < num_y_levels &&
+       (offset_data.offsets.size() > idx) &&
+       offset_data.offsets[idx].size() > static_cast<size_t>(dy) &&
+       offset_data.offsets[idx][size_t(dy)].size() > static_cast<size_t>(dx)) {
+      return true;
+    }
+  }
+
+    break;
+
+  default:
+
+    return false;
+  }
+
+  return false;
+}
+
+static bool ReconstructTileOffsets(OffsetData& offset_data,
+                                   const EXRHeader* exr_header,
+                                   const unsigned char* head, const unsigned char* marker, const size_t size,
+                                   bool isMultiPartFile,
+                                   bool isDeep) {
+  int numXLevels = offset_data.num_x_levels;
+  for (unsigned int l = 0; l < offset_data.offsets.size(); ++l) {
+    for (unsigned int dy = 0; dy < offset_data.offsets[l].size(); ++dy) {
+      for (unsigned int dx = 0; dx < offset_data.offsets[l][dy].size(); ++dx) {
+        tinyexr::tinyexr_uint64 tileOffset = tinyexr::tinyexr_uint64(marker - head);
+
+
+        if (isMultiPartFile) {
+          if ((marker + sizeof(int)) >= (head + size)) {
+            return false;
+          }
+
+          //int partNumber;
+          marker += sizeof(int);
+        }
+
+        if ((marker + 4 * sizeof(int)) >= (head + size)) {
+          return false;
+        }
+
+        int tileX;
+        memcpy(&tileX, marker, sizeof(int));
+        tinyexr::swap4(&tileX);
+        marker += sizeof(int);
+
+        int tileY;
+        memcpy(&tileY, marker, sizeof(int));
+        tinyexr::swap4(&tileY);
+        marker += sizeof(int);
+
+        int levelX;
+        memcpy(&levelX, marker, sizeof(int));
+        tinyexr::swap4(&levelX);
+        marker += sizeof(int);
+
+        int levelY;
+        memcpy(&levelY, marker, sizeof(int));
+        tinyexr::swap4(&levelY);
+        marker += sizeof(int);
+
+        if (isDeep) {
+          if ((marker + 2 * sizeof(tinyexr::tinyexr_int64)) >= (head + size)) {
+            return false;
+          }
+          tinyexr::tinyexr_int64 packed_offset_table_size;
+          memcpy(&packed_offset_table_size, marker, sizeof(tinyexr::tinyexr_int64));
+          tinyexr::swap8(reinterpret_cast<tinyexr::tinyexr_uint64*>(&packed_offset_table_size));
+          marker += sizeof(tinyexr::tinyexr_int64);
+
+          tinyexr::tinyexr_int64 packed_sample_size;
+          memcpy(&packed_sample_size, marker, sizeof(tinyexr::tinyexr_int64));
+          tinyexr::swap8(reinterpret_cast<tinyexr::tinyexr_uint64*>(&packed_sample_size));
+          marker += sizeof(tinyexr::tinyexr_int64);
+
+          // next Int64 is unpacked sample size - skip that too
+          marker += packed_offset_table_size + packed_sample_size + 8;
+
+          if (marker >= (head + size)) {
+            return false;
+          }
+
+        } else {
+
+          if ((marker + sizeof(uint32_t)) >= (head + size)) {
+            return false;
+          }
+
+          uint32_t dataSize;
+          memcpy(&dataSize, marker, sizeof(uint32_t));
+          tinyexr::swap4(&dataSize);
+          marker += sizeof(uint32_t);
+
+          marker += dataSize;
+
+          if (marker >= (head + size)) {
+            return false;
+          }
+        }
+
+        if (!isValidTile(exr_header, offset_data,
+          tileX, tileY, levelX, levelY)) {
+          return false;
+        }
+
+        int level_idx = LevelIndex(levelX, levelY, exr_header->tile_level_mode, numXLevels);
+        if (level_idx < 0) {
+          return false;
+        }
+
+        if (size_t(level_idx) >= offset_data.offsets.size()) {
+          return false;
+        }
+
+        if (size_t(tileY) >= offset_data.offsets[size_t(level_idx)].size()) {
+          return false;
+        }
+
+        if (size_t(tileX) >= offset_data.offsets[size_t(level_idx)][size_t(tileY)].size()) {
+          return false;
+        }
+        
+        offset_data.offsets[size_t(level_idx)][size_t(tileY)][size_t(tileX)] = tileOffset;
+      }
+    }
+  }
+  return true;
+}
+
+// marker output is also
+static int ReadOffsets(OffsetData& offset_data,
+                       const unsigned char* head,
+                       const unsigned char*& marker,
+                       const size_t size,
+                       const char** err) {
+  for (unsigned int l = 0; l < offset_data.offsets.size(); ++l) {
+    for (unsigned int dy = 0; dy < offset_data.offsets[l].size(); ++dy) {
+      for (unsigned int dx = 0; dx < offset_data.offsets[l][dy].size(); ++dx) {
+        tinyexr::tinyexr_uint64 offset;
+        if ((marker + sizeof(tinyexr_uint64)) >= (head + size)) {
+          tinyexr::SetErrorMessage("Insufficient data size in offset table.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+
+        memcpy(&offset, marker, sizeof(tinyexr::tinyexr_uint64));
+        tinyexr::swap8(&offset);
+        if (offset >= size) {
+          tinyexr::SetErrorMessage("Invalid offset value in DecodeEXRImage.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+        marker += sizeof(tinyexr::tinyexr_uint64);  // = 8
+        offset_data.offsets[l][dy][dx] = offset;
+      }
+    }
+  }
+  return TINYEXR_SUCCESS;
+}
+
+static int DecodeEXRImage(EXRImage *exr_image, const EXRHeader *exr_header,
+                          const unsigned char *head,
+                          const unsigned char *marker, const size_t size,
+                          const char **err) {
+  if (exr_image == NULL || exr_header == NULL || head == NULL ||
+      marker == NULL || (size <= tinyexr::kEXRVersionSize)) {
+    tinyexr::SetErrorMessage("Invalid argument for DecodeEXRImage().", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  int num_scanline_blocks = 1;
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    num_scanline_blocks = 16;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    num_scanline_blocks = 32;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    num_scanline_blocks = 16;
+  }
+
+  if (exr_header->data_window.max_x < exr_header->data_window.min_x ||
+      exr_header->data_window.max_x - exr_header->data_window.min_x ==
+          std::numeric_limits<int>::max()) {
+    // Issue 63
+    tinyexr::SetErrorMessage("Invalid data width value", err);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+  tinyexr_int64 data_width =
+      static_cast<tinyexr_int64>(exr_header->data_window.max_x) - static_cast<tinyexr_int64>(exr_header->data_window.min_x) + static_cast<tinyexr_int64>(1);
+  if (data_width <= 0) {
+    tinyexr::SetErrorMessage("Invalid data window width value", err);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  if (exr_header->data_window.max_y < exr_header->data_window.min_y ||
+      exr_header->data_window.max_y - exr_header->data_window.min_y ==
+          std::numeric_limits<int>::max()) {
+    tinyexr::SetErrorMessage("Invalid data height value", err);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+  tinyexr_int64 data_height =
+      static_cast<tinyexr_int64>(exr_header->data_window.max_y) - static_cast<tinyexr_int64>(exr_header->data_window.min_y) + static_cast<tinyexr_int64>(1);
+
+  if (data_height <= 0) {
+    tinyexr::SetErrorMessage("Invalid data window height value", err);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  // Do not allow too large data_width and data_height. header invalid?
+  {
+    if (data_width > TINYEXR_DIMENSION_THRESHOLD) {
+      tinyexr::SetErrorMessage("data width too large.", err);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    if (data_height > TINYEXR_DIMENSION_THRESHOLD) {
+      tinyexr::SetErrorMessage("data height too large.", err);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+  }
+
+  if (exr_header->tiled) {
+    if (exr_header->tile_size_x > TINYEXR_DIMENSION_THRESHOLD) {
+      tinyexr::SetErrorMessage("tile width too large.", err);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    if (exr_header->tile_size_y > TINYEXR_DIMENSION_THRESHOLD) {
+      tinyexr::SetErrorMessage("tile height too large.", err);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+  }
+
+  // Read offset tables.
+  OffsetData offset_data;
+  size_t num_blocks = 0;
+  // For a multi-resolution image, the size of the offset table will be calculated from the other attributes of the header.
+  // If chunk_count > 0 then chunk_count must be equal to the calculated tile count.
+  if (exr_header->tiled) {
+    {
+      std::vector<int> num_x_tiles, num_y_tiles;
+      if (!PrecalculateTileInfo(num_x_tiles, num_y_tiles, exr_header)) {
+        tinyexr::SetErrorMessage("Failed to precalculate tile info.", err);
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+      num_blocks = size_t(InitTileOffsets(offset_data, exr_header, num_x_tiles, num_y_tiles));
+      if (exr_header->chunk_count > 0) {
+        if (exr_header->chunk_count != static_cast<int>(num_blocks)) {
+          tinyexr::SetErrorMessage("Invalid offset table size.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+      }
+    }
+
+    int ret = ReadOffsets(offset_data, head, marker, size, err);
+    if (ret != TINYEXR_SUCCESS) return ret;
+    if (IsAnyOffsetsAreInvalid(offset_data)) {
+      if (!ReconstructTileOffsets(offset_data, exr_header,
+        head, marker, size,
+        exr_header->multipart, exr_header->non_image)) {
+
+          tinyexr::SetErrorMessage("Invalid Tile Offsets data.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+      }
+    }
+  } else if (exr_header->chunk_count > 0) {
+    // Use `chunkCount` attribute.
+    num_blocks = static_cast<size_t>(exr_header->chunk_count);
+    InitSingleResolutionOffsets(offset_data, num_blocks);
+  } else {
+    num_blocks = static_cast<size_t>(data_height) /
+      static_cast<size_t>(num_scanline_blocks);
+    if (num_blocks * static_cast<size_t>(num_scanline_blocks) <
+      static_cast<size_t>(data_height)) {
+      num_blocks++;
+    }
+
+    InitSingleResolutionOffsets(offset_data, num_blocks);
+  }
+
+  if (!exr_header->tiled) {
+    std::vector<tinyexr::tinyexr_uint64>& offsets = offset_data.offsets[0][0];
+    for (size_t y = 0; y < num_blocks; y++) {
+      tinyexr::tinyexr_uint64 offset;
+      // Issue #81
+      if ((marker + sizeof(tinyexr_uint64)) >= (head + size)) {
+        tinyexr::SetErrorMessage("Insufficient data size in offset table.", err);
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      memcpy(&offset, marker, sizeof(tinyexr::tinyexr_uint64));
+      tinyexr::swap8(&offset);
+      if (offset >= size) {
+        tinyexr::SetErrorMessage("Invalid offset value in DecodeEXRImage.", err);
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+      marker += sizeof(tinyexr::tinyexr_uint64);  // = 8
+      offsets[y] = offset;
+    }
+
+    // If line offsets are invalid, we try to reconstruct it.
+    // See OpenEXR/IlmImf/ImfScanLineInputFile.cpp::readLineOffsets() for details.
+    for (size_t y = 0; y < num_blocks; y++) {
+      if (offsets[y] <= 0) {
+        // TODO(syoyo) Report as warning?
+        // if (err) {
+        //  stringstream ss;
+        //  ss << "Incomplete lineOffsets." << std::endl;
+        //  (*err) += ss.str();
+        //}
+        bool ret =
+          ReconstructLineOffsets(&offsets, num_blocks, head, marker, size);
+        if (ret) {
+          // OK
+          break;
+        } else {
+          tinyexr::SetErrorMessage(
+            "Cannot reconstruct lineOffset table in DecodeEXRImage.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+      }
+    }
+  }
+
+  {
+    std::string e;
+    int ret = DecodeChunk(exr_image, exr_header, offset_data, head, size, &e);
+
+    if (ret != TINYEXR_SUCCESS) {
+      if (!e.empty()) {
+        tinyexr::SetErrorMessage(e, err);
+      }
+
+#if 1
+      FreeEXRImage(exr_image);
+#else
+      // release memory(if exists)
+      if ((exr_header->num_channels > 0) && exr_image && exr_image->images) {
+        for (size_t c = 0; c < size_t(exr_header->num_channels); c++) {
+          if (exr_image->images[c]) {
+            free(exr_image->images[c]);
+            exr_image->images[c] = NULL;
+          }
+        }
+        free(exr_image->images);
+        exr_image->images = NULL;
+      }
+#endif
+    }
+
+    return ret;
+  }
+}
+
+static void GetLayers(const EXRHeader &exr_header,
+                      std::vector<std::string> &layer_names) {
+  // Naive implementation
+  // Group channels by layers
+  // go over all channel names, split by periods
+  // collect unique names
+  layer_names.clear();
+  for (int c = 0; c < exr_header.num_channels; c++) {
+    std::string full_name(exr_header.channels[c].name);
+    const size_t pos = full_name.find_last_of('.');
+    if (pos != std::string::npos && pos != 0 && pos + 1 < full_name.size()) {
+      full_name.erase(pos);
+      if (std::find(layer_names.begin(), layer_names.end(), full_name) ==
+          layer_names.end())
+        layer_names.push_back(full_name);
+    }
+  }
+}
+
+struct LayerChannel {
+  explicit LayerChannel(size_t i, std::string n) : index(i), name(n) {}
+  size_t index;
+  std::string name;
+};
+
+static void ChannelsInLayer(const EXRHeader &exr_header,
+                            const std::string &layer_name,
+                            std::vector<LayerChannel> &channels) {
+  channels.clear();
+  //std::cout << "layer_name = " << layer_name << "\n";
+  for (int c = 0; c < exr_header.num_channels; c++) {
+    //std::cout << "chan[" << c << "] = " << exr_header.channels[c].name << "\n";
+    std::string ch_name(exr_header.channels[c].name);
+    if (layer_name.empty()) {
+      const size_t pos = ch_name.find_last_of('.');
+      if (pos != std::string::npos && pos < ch_name.size()) {
+        if (pos != 0) continue;
+        ch_name = ch_name.substr(pos + 1);
+      }
+    } else {
+      const size_t pos = ch_name.find(layer_name + '.');
+      if (pos == std::string::npos) continue;
+      if (pos == 0) {
+        ch_name = ch_name.substr(layer_name.size() + 1);
+      }
+    }
+    LayerChannel ch(size_t(c), ch_name);
+    channels.push_back(ch);
+  }
+}
+
+}  // namespace tinyexr
+
+int EXRLayers(const char *filename, const char **layer_names[], int *num_layers,
+              const char **err) {
+  EXRVersion exr_version;
+  EXRHeader exr_header;
+  InitEXRHeader(&exr_header);
+
+  {
+    int ret = ParseEXRVersionFromFile(&exr_version, filename);
+    if (ret != TINYEXR_SUCCESS) {
+      tinyexr::SetErrorMessage("Invalid EXR header.", err);
+      return ret;
+    }
+
+    if (exr_version.multipart || exr_version.non_image) {
+      tinyexr::SetErrorMessage(
+          "Loading multipart or DeepImage is not supported  in LoadEXR() API",
+          err);
+      return TINYEXR_ERROR_INVALID_DATA;  // @fixme.
+    }
+  }
+
+  int ret = ParseEXRHeaderFromFile(&exr_header, &exr_version, filename, err);
+  if (ret != TINYEXR_SUCCESS) {
+    FreeEXRHeader(&exr_header);
+    return ret;
+  }
+
+  std::vector<std::string> layer_vec;
+  tinyexr::GetLayers(exr_header, layer_vec);
+
+  (*num_layers) = int(layer_vec.size());
+  (*layer_names) = static_cast<const char **>(
+      malloc(sizeof(const char *) * static_cast<size_t>(layer_vec.size())));
+  for (size_t c = 0; c < static_cast<size_t>(layer_vec.size()); c++) {
+#ifdef _MSC_VER
+    (*layer_names)[c] = _strdup(layer_vec[c].c_str());
+#else
+    (*layer_names)[c] = strdup(layer_vec[c].c_str());
+#endif
+  }
+
+  FreeEXRHeader(&exr_header);
+  return TINYEXR_SUCCESS;
+}
+
+int LoadEXR(float **out_rgba, int *width, int *height, const char *filename,
+            const char **err, int *num_chans) {
+  return LoadEXRWithLayer(out_rgba, width, height, filename,
+                          /* layername */ NULL, err, num_chans);
+}
+
+int LoadEXRWithLayer(float **out_rgba, int *width, int *height,
+                     const char *filename, const char *layername,
+                     const char **err, int *num_chans) {
+    if (num_chans)
+        *num_chans = 0;
+
+    if (out_rgba == NULL) {
+    tinyexr::SetErrorMessage("Invalid argument for LoadEXR()", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  EXRVersion exr_version;
+  EXRImage exr_image;
+  EXRHeader exr_header;
+  InitEXRHeader(&exr_header);
+  InitEXRImage(&exr_image);
+
+  {
+    int ret = ParseEXRVersionFromFile(&exr_version, filename);
+    if (ret != TINYEXR_SUCCESS) {
+      std::stringstream ss;
+      ss << "Failed to open EXR file or read version info from EXR file. code("
+         << ret << ")";
+      tinyexr::SetErrorMessage(ss.str(), err);
+      return ret;
+    }
+
+    if (exr_version.multipart || exr_version.non_image) {
+      tinyexr::SetErrorMessage(
+          "Loading multipart or DeepImage is not supported  in LoadEXR() API",
+          err);
+      return TINYEXR_ERROR_INVALID_DATA;  // @fixme.
+    }
+  }
+
+  {
+    int ret = ParseEXRHeaderFromFile(&exr_header, &exr_version, filename, err);
+    if (ret != TINYEXR_SUCCESS) {
+      FreeEXRHeader(&exr_header);
+      return ret;
+    }
+  }
+
+  // Read HALF channel as FLOAT.
+  for (int i = 0; i < exr_header.num_channels; i++) {
+    if (exr_header.pixel_types[i] == TINYEXR_PIXELTYPE_HALF) {
+      exr_header.requested_pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT;
+    }
+  }
+
+  // TODO: Probably limit loading to layers (channels) selected by layer index
+  {
+    int ret = LoadEXRImageFromFile(&exr_image, &exr_header, filename, err);
+    if (ret != TINYEXR_SUCCESS) {
+      FreeEXRHeader(&exr_header);
+      return ret;
+    }
+  }
+
+  // RGBA
+  int idxR = -1;
+  int idxG = -1;
+  int idxB = -1;
+  int idxA = -1;
+
+  std::vector<std::string> layer_names;
+  tinyexr::GetLayers(exr_header, layer_names);
+
+  std::vector<tinyexr::LayerChannel> channels;
+  tinyexr::ChannelsInLayer(
+      exr_header, layername == NULL ? "" : std::string(layername), channels);
+
+
+  if (channels.size() < 1) {
+    if (layername == NULL) {
+      tinyexr::SetErrorMessage("Layer Not Found. Seems EXR contains channels with layer(e.g. `diffuse.R`). if you are using LoadEXR(), please try LoadEXRWithLayer(). LoadEXR() cannot load EXR having channels with layer.", err);
+
+    } else {
+      tinyexr::SetErrorMessage("Layer Not Found", err);
+    }
+    FreeEXRHeader(&exr_header);
+    FreeEXRImage(&exr_image);
+    return TINYEXR_ERROR_LAYER_NOT_FOUND;
+  }
+
+  size_t ch_count = channels.size() < 4 ? channels.size() : 4;
+  for (size_t c = 0; c < ch_count; c++) {
+    const tinyexr::LayerChannel &ch = channels[c];
+
+    if (ch.name == "R") {
+      idxR = int(ch.index);
+    } else if (ch.name == "G") {
+      idxG = int(ch.index);
+    } else if (ch.name == "B") {
+      idxB = int(ch.index);
+    } else if (ch.name == "A") {
+      idxA = int(ch.index);
+    }
+  }
+
+  if (channels.size() == 1) {
+    if (num_chans)
+      *num_chans = 1;
+
+    int chIdx = int(channels.front().index);
+    // Grayscale channel only.
+
+    (*out_rgba) = reinterpret_cast<float *>(
+        malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+               static_cast<size_t>(exr_image.height)));
+
+    if (exr_header.tiled) {
+      const size_t tile_size_x = static_cast<size_t>(exr_header.tile_size_x);
+      const size_t tile_size_y = static_cast<size_t>(exr_header.tile_size_y);
+      for (int it = 0; it < exr_image.num_tiles; it++) {
+        for (size_t j = 0; j < tile_size_y; j++) {
+          for (size_t i = 0; i < tile_size_x; i++) {
+            const size_t ii =
+              static_cast<size_t>(exr_image.tiles[it].offset_x) * tile_size_x +
+              i;
+            const size_t jj =
+              static_cast<size_t>(exr_image.tiles[it].offset_y) * tile_size_y +
+              j;
+            const size_t idx = ii + jj * static_cast<size_t>(exr_image.width);
+
+            // out of region check.
+            if (ii >= static_cast<size_t>(exr_image.width)) {
+              continue;
+            }
+            if (jj >= static_cast<size_t>(exr_image.height)) {
+              continue;
+            }
+            const size_t srcIdx = i + j * tile_size_x;
+            unsigned char **src = exr_image.tiles[it].images;
+            (*out_rgba)[4 * idx + 0] =
+                reinterpret_cast<float **>(src)[chIdx][srcIdx];
+            (*out_rgba)[4 * idx + 1] =
+                reinterpret_cast<float **>(src)[chIdx][srcIdx];
+            (*out_rgba)[4 * idx + 2] =
+                reinterpret_cast<float **>(src)[chIdx][srcIdx];
+            (*out_rgba)[4 * idx + 3] =
+                reinterpret_cast<float **>(src)[chIdx][srcIdx];
+          }
+        }
+      }
+    } else {
+      const size_t pixel_size = static_cast<size_t>(exr_image.width) *
+        static_cast<size_t>(exr_image.height);
+      for (size_t i = 0; i < pixel_size; i++) {
+        const float val =
+            reinterpret_cast<float **>(exr_image.images)[chIdx][i];
+        (*out_rgba)[4 * i + 0] = val;
+        (*out_rgba)[4 * i + 1] = val;
+        (*out_rgba)[4 * i + 2] = val;
+        (*out_rgba)[4 * i + 3] = val;
+      }
+    }
+  } else {
+    // Assume RGB(A)
+
+    if (idxR == -1) {
+      tinyexr::SetErrorMessage("R channel not found", err);
+
+      FreeEXRHeader(&exr_header);
+      FreeEXRImage(&exr_image);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    if (idxG == -1) {
+      tinyexr::SetErrorMessage("G channel not found", err);
+      FreeEXRHeader(&exr_header);
+      FreeEXRImage(&exr_image);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    if (idxB == -1) {
+      tinyexr::SetErrorMessage("B channel not found", err);
+      FreeEXRHeader(&exr_header);
+      FreeEXRImage(&exr_image);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    if (num_chans)
+        *num_chans = (idxA != -1) ? 4 : 3;
+
+    (*out_rgba) = reinterpret_cast<float *>(
+        malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+               static_cast<size_t>(exr_image.height)));
+    if (exr_header.tiled) {
+      const size_t tile_size_x = static_cast<size_t>(exr_header.tile_size_x);
+      const size_t tile_size_y = static_cast<size_t>(exr_header.tile_size_y);
+      for (int it = 0; it < exr_image.num_tiles; it++) {
+        for (size_t j = 0; j < tile_size_y; j++) {
+          for (size_t i = 0; i < tile_size_x; i++) {
+            const size_t ii =
+                static_cast<size_t>(exr_image.tiles[it].offset_x) *
+                    tile_size_x +
+                i;
+            const size_t jj =
+                static_cast<size_t>(exr_image.tiles[it].offset_y) *
+                    tile_size_y +
+                j;
+            const size_t idx = ii + jj * static_cast<size_t>(exr_image.width);
+
+            // out of region check.
+            if (ii >= static_cast<size_t>(exr_image.width)) {
+              continue;
+            }
+            if (jj >= static_cast<size_t>(exr_image.height)) {
+              continue;
+            }
+            const size_t srcIdx = i + j * tile_size_x;
+            unsigned char **src = exr_image.tiles[it].images;
+            (*out_rgba)[4 * idx + 0] =
+                reinterpret_cast<float **>(src)[idxR][srcIdx];
+            (*out_rgba)[4 * idx + 1] =
+                reinterpret_cast<float **>(src)[idxG][srcIdx];
+            (*out_rgba)[4 * idx + 2] =
+                reinterpret_cast<float **>(src)[idxB][srcIdx];
+            if (idxA != -1) {
+              (*out_rgba)[4 * idx + 3] =
+                  reinterpret_cast<float **>(src)[idxA][srcIdx];
+            } else {
+              (*out_rgba)[4 * idx + 3] = 1.0;
+            }
+          }
+        }
+      }
+    } else {
+      const size_t pixel_size = static_cast<size_t>(exr_image.width) *
+        static_cast<size_t>(exr_image.height);
+      for (size_t i = 0; i < pixel_size; i++) {
+        (*out_rgba)[4 * i + 0] =
+            reinterpret_cast<float **>(exr_image.images)[idxR][i];
+        (*out_rgba)[4 * i + 1] =
+            reinterpret_cast<float **>(exr_image.images)[idxG][i];
+        (*out_rgba)[4 * i + 2] =
+            reinterpret_cast<float **>(exr_image.images)[idxB][i];
+        if (idxA != -1) {
+          (*out_rgba)[4 * i + 3] =
+              reinterpret_cast<float **>(exr_image.images)[idxA][i];
+        } else {
+          (*out_rgba)[4 * i + 3] = 1.0;
+        }
+      }
+    }
+  }
+
+  (*width) = exr_image.width;
+  (*height) = exr_image.height;
+
+  FreeEXRHeader(&exr_header);
+  FreeEXRImage(&exr_image);
+
+  return TINYEXR_SUCCESS;
+}
+
+int IsEXR(const char *filename) {
+  EXRVersion exr_version;
+
+  int ret = ParseEXRVersionFromFile(&exr_version, filename);
+  if (ret != TINYEXR_SUCCESS) {
+    return ret;
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+int IsEXRFromMemory(const unsigned char *memory, size_t size) {
+  EXRVersion exr_version;
+
+  int ret = ParseEXRVersionFromMemory(&exr_version, memory, size);
+  if (ret != TINYEXR_SUCCESS) {
+    return ret;
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+int ParseEXRHeaderFromMemory(EXRHeader *exr_header, const EXRVersion *version,
+                             const unsigned char *memory, size_t size,
+                             const char **err) {
+  if (memory == NULL || exr_header == NULL) {
+    tinyexr::SetErrorMessage(
+        "Invalid argument. `memory` or `exr_header` argument is null in "
+        "ParseEXRHeaderFromMemory()",
+        err);
+
+    // Invalid argument
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (size < tinyexr::kEXRVersionSize) {
+    tinyexr::SetErrorMessage("Insufficient header/data size.\n", err);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  const unsigned char *marker = memory + tinyexr::kEXRVersionSize;
+  size_t marker_size = size - tinyexr::kEXRVersionSize;
+
+  tinyexr::HeaderInfo info;
+  info.clear();
+
+  int ret;
+  {
+    std::string err_str;
+    ret = ParseEXRHeader(&info, NULL, version, &err_str, marker, marker_size);
+
+    if (ret != TINYEXR_SUCCESS) {
+      if (err && !err_str.empty()) {
+        tinyexr::SetErrorMessage(err_str, err);
+      }
+    }
+  }
+
+  {
+    std::string warn;
+    std::string err_str;
+
+    if (!ConvertHeader(exr_header, info, &warn, &err_str)) {
+      // release mem
+      for (size_t i = 0; i < info.attributes.size(); i++) {
+        if (info.attributes[i].value) {
+          free(info.attributes[i].value);
+        }
+      }
+      if (err && !err_str.empty()) {
+        tinyexr::SetErrorMessage(err_str, err);
+      }
+      ret = TINYEXR_ERROR_INVALID_HEADER;
+    }
+  }
+
+  exr_header->multipart = version->multipart ? 1 : 0;
+  exr_header->non_image = version->non_image ? 1 : 0;
+
+  return ret;
+}
+
+int LoadEXRFromMemory(float **out_rgba, int *width, int *height,
+                      const unsigned char *memory, size_t size,
+                      const char **err) {
+  if (out_rgba == NULL || memory == NULL) {
+    tinyexr::SetErrorMessage("Invalid argument for LoadEXRFromMemory", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  EXRVersion exr_version;
+  EXRImage exr_image;
+  EXRHeader exr_header;
+
+  InitEXRHeader(&exr_header);
+
+  int ret = ParseEXRVersionFromMemory(&exr_version, memory, size);
+  if (ret != TINYEXR_SUCCESS) {
+    std::stringstream ss;
+    ss << "Failed to parse EXR version. code(" << ret << ")";
+    tinyexr::SetErrorMessage(ss.str(), err);
+    return ret;
+  }
+
+  ret = ParseEXRHeaderFromMemory(&exr_header, &exr_version, memory, size, err);
+  if (ret != TINYEXR_SUCCESS) {
+    return ret;
+  }
+
+  // Read HALF channel as FLOAT.
+  for (int i = 0; i < exr_header.num_channels; i++) {
+    if (exr_header.pixel_types[i] == TINYEXR_PIXELTYPE_HALF) {
+      exr_header.requested_pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT;
+    }
+  }
+
+  InitEXRImage(&exr_image);
+  ret = LoadEXRImageFromMemory(&exr_image, &exr_header, memory, size, err);
+  if (ret != TINYEXR_SUCCESS) {
+    return ret;
+  }
+
+  // RGBA
+  int idxR = -1;
+  int idxG = -1;
+  int idxB = -1;
+  int idxA = -1;
+  for (int c = 0; c < exr_header.num_channels; c++) {
+    if (strcmp(exr_header.channels[c].name, "R") == 0) {
+      idxR = c;
+    } else if (strcmp(exr_header.channels[c].name, "G") == 0) {
+      idxG = c;
+    } else if (strcmp(exr_header.channels[c].name, "B") == 0) {
+      idxB = c;
+    } else if (strcmp(exr_header.channels[c].name, "A") == 0) {
+      idxA = c;
+    }
+  }
+
+  // TODO(syoyo): Refactor removing same code as used in LoadEXR().
+  if (exr_header.num_channels == 1) {
+    // Grayscale channel only.
+
+    (*out_rgba) = reinterpret_cast<float *>(
+        malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+               static_cast<size_t>(exr_image.height)));
+
+    if (exr_header.tiled) {
+      const size_t tile_size_x = static_cast<size_t>(exr_header.tile_size_x);
+      const size_t tile_size_y = static_cast<size_t>(exr_header.tile_size_y);
+      for (int it = 0; it < exr_image.num_tiles; it++) {
+        for (size_t j = 0; j < tile_size_y; j++) {
+          for (size_t i = 0; i < tile_size_x; i++) {
+            const size_t ii =
+                static_cast<size_t>(exr_image.tiles[it].offset_x) *
+                    tile_size_x +
+                i;
+            const size_t jj =
+                static_cast<size_t>(exr_image.tiles[it].offset_y) *
+                    tile_size_y +
+                j;
+            const size_t idx = ii + jj * static_cast<size_t>(exr_image.width);
+
+            // out of region check.
+            if (ii >= static_cast<size_t>(exr_image.width)) {
+              continue;
+            }
+            if (jj >= static_cast<size_t>(exr_image.height)) {
+              continue;
+            }
+            const size_t srcIdx = i + j * tile_size_x;
+            unsigned char **src = exr_image.tiles[it].images;
+            (*out_rgba)[4 * idx + 0] =
+                reinterpret_cast<float **>(src)[0][srcIdx];
+            (*out_rgba)[4 * idx + 1] =
+                reinterpret_cast<float **>(src)[0][srcIdx];
+            (*out_rgba)[4 * idx + 2] =
+                reinterpret_cast<float **>(src)[0][srcIdx];
+            (*out_rgba)[4 * idx + 3] =
+                reinterpret_cast<float **>(src)[0][srcIdx];
+          }
+        }
+      }
+    } else {
+      const size_t pixel_size = static_cast<size_t>(exr_image.width) *
+        static_cast<size_t>(exr_image.height);
+      for (size_t i = 0; i < pixel_size; i++) {
+        const float val = reinterpret_cast<float **>(exr_image.images)[0][i];
+        (*out_rgba)[4 * i + 0] = val;
+        (*out_rgba)[4 * i + 1] = val;
+        (*out_rgba)[4 * i + 2] = val;
+        (*out_rgba)[4 * i + 3] = val;
+      }
+    }
+
+  } else {
+    // TODO(syoyo): Support non RGBA image.
+
+    if (idxR == -1) {
+      tinyexr::SetErrorMessage("R channel not found", err);
+
+      // @todo { free exr_image }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    if (idxG == -1) {
+      tinyexr::SetErrorMessage("G channel not found", err);
+      // @todo { free exr_image }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    if (idxB == -1) {
+      tinyexr::SetErrorMessage("B channel not found", err);
+      // @todo { free exr_image }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    (*out_rgba) = reinterpret_cast<float *>(
+        malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+               static_cast<size_t>(exr_image.height)));
+
+    if (exr_header.tiled) {
+      const size_t tile_size_x = static_cast<size_t>(exr_header.tile_size_x);
+      const size_t tile_size_y = static_cast<size_t>(exr_header.tile_size_y);
+      for (int it = 0; it < exr_image.num_tiles; it++) {
+        for (size_t j = 0; j < tile_size_y; j++)
+          for (size_t i = 0; i < tile_size_x; i++) {
+            const size_t ii =
+                static_cast<size_t>(exr_image.tiles[it].offset_x) *
+                    tile_size_x +
+                i;
+            const size_t jj =
+                static_cast<size_t>(exr_image.tiles[it].offset_y) *
+                    tile_size_y +
+                j;
+            const size_t idx = ii + jj * static_cast<size_t>(exr_image.width);
+
+            // out of region check.
+            if (ii >= static_cast<size_t>(exr_image.width)) {
+              continue;
+            }
+            if (jj >= static_cast<size_t>(exr_image.height)) {
+              continue;
+            }
+            const size_t srcIdx = i + j * tile_size_x;
+            unsigned char **src = exr_image.tiles[it].images;
+            (*out_rgba)[4 * idx + 0] =
+                reinterpret_cast<float **>(src)[idxR][srcIdx];
+            (*out_rgba)[4 * idx + 1] =
+                reinterpret_cast<float **>(src)[idxG][srcIdx];
+            (*out_rgba)[4 * idx + 2] =
+                reinterpret_cast<float **>(src)[idxB][srcIdx];
+            if (idxA != -1) {
+              (*out_rgba)[4 * idx + 3] =
+                  reinterpret_cast<float **>(src)[idxA][srcIdx];
+            } else {
+              (*out_rgba)[4 * idx + 3] = 1.0;
+            }
+          }
+      }
+    } else {
+      const size_t pixel_size = static_cast<size_t>(exr_image.width) *
+        static_cast<size_t>(exr_image.height);
+      for (size_t i = 0; i < pixel_size; i++) {
+        (*out_rgba)[4 * i + 0] =
+            reinterpret_cast<float **>(exr_image.images)[idxR][i];
+        (*out_rgba)[4 * i + 1] =
+            reinterpret_cast<float **>(exr_image.images)[idxG][i];
+        (*out_rgba)[4 * i + 2] =
+            reinterpret_cast<float **>(exr_image.images)[idxB][i];
+        if (idxA != -1) {
+          (*out_rgba)[4 * i + 3] =
+              reinterpret_cast<float **>(exr_image.images)[idxA][i];
+        } else {
+          (*out_rgba)[4 * i + 3] = 1.0;
+        }
+      }
+    }
+  }
+
+  (*width) = exr_image.width;
+  (*height) = exr_image.height;
+
+  FreeEXRHeader(&exr_header);
+  FreeEXRImage(&exr_image);
+
+  return TINYEXR_SUCCESS;
+}
+
+// Represents a read-only file mapped to an address space in memory.
+// If no memory-mapping API is available, falls back to allocating a buffer
+// with a copy of the file's data.
+struct MemoryMappedFile {
+  unsigned char *data;  // To the start of the file's data.
+  size_t size;          // The size of the file in bytes.
+#ifdef TINYEXR_USE_WIN32_MMAP
+  HANDLE windows_file;
+  HANDLE windows_file_mapping;
+#elif defined(TINYEXR_USE_POSIX_MMAP)
+  int posix_descriptor;
+#endif
+
+  // MemoryMappedFile's constructor tries to map memory to a file.
+  // If this succeeds, valid() will return true and all fields
+  // are usable; otherwise, valid() will return false.
+  MemoryMappedFile(const char *filename) {
+    data = NULL;
+    size = 0;
+#ifdef TINYEXR_USE_WIN32_MMAP
+    windows_file_mapping = NULL;
+    windows_file =
+        CreateFileW(tinyexr::UTF8ToWchar(filename).c_str(),  // lpFileName
+                    GENERIC_READ,                            // dwDesiredAccess
+                    FILE_SHARE_READ,                         // dwShareMode
+                    NULL,                     // lpSecurityAttributes
+                    OPEN_EXISTING,            // dwCreationDisposition
+                    FILE_ATTRIBUTE_READONLY,  // dwFlagsAndAttributes
+                    NULL);                    // hTemplateFile
+    if (windows_file == INVALID_HANDLE_VALUE) {
+      return;
+    }
+
+    windows_file_mapping = CreateFileMapping(windows_file,  // hFile
+                                             NULL,  // lpFileMappingAttributes
+                                             PAGE_READONLY,  // flProtect
+                                             0,      // dwMaximumSizeHigh
+                                             0,      // dwMaximumSizeLow
+                                             NULL);  // lpName
+    if (windows_file_mapping == NULL) {
+      return;
+    }
+
+    data = reinterpret_cast<unsigned char *>(
+        MapViewOfFile(windows_file_mapping,  // hFileMappingObject
+                      FILE_MAP_READ,         // dwDesiredAccess
+                      0,                     // dwFileOffsetHigh
+                      0,                     // dwFileOffsetLow
+                      0));                   // dwNumberOfBytesToMap
+    if (!data) {
+      return;
+    }
+
+    LARGE_INTEGER windows_file_size = {};
+    if (!GetFileSizeEx(windows_file, &windows_file_size) ||
+        static_cast<ULONGLONG>(windows_file_size.QuadPart) >
+            std::numeric_limits<size_t>::max()) {
+      UnmapViewOfFile(data);
+      data = NULL;
+      return;
+    }
+    size = static_cast<size_t>(windows_file_size.QuadPart);
+#elif defined(TINYEXR_USE_POSIX_MMAP)
+    posix_descriptor = open(filename, O_RDONLY);
+    if (posix_descriptor == -1) {
+      return;
+    }
+
+    struct stat info;
+    if (fstat(posix_descriptor, &info) < 0) {
+      return;
+    }
+    // Make sure st_size is in the valid range for a size_t. The second case
+    // can only fail if a POSIX implementation defines off_t to be a larger
+    // type than size_t - for instance, compiling with _FILE_OFFSET_BITS=64
+    // on a 32-bit system. On current 64-bit systems, this check can never
+    // fail, so we turn off clang's Wtautological-type-limit-compare warning
+    // around this code.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-type-limit-compare"
+#endif
+    if (info.st_size < 0 ||
+        info.st_size > std::numeric_limits<ssize_t>::max()) {
+      return;
+    }
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    size = static_cast<size_t>(info.st_size);
+
+    data = reinterpret_cast<unsigned char *>(
+        mmap(0, size, PROT_READ, MAP_SHARED, posix_descriptor, 0));
+    if (data == MAP_FAILED) {
+      data = nullptr;
+      return;
+    }
+#else
+    FILE *fp = fopen(filename, "rb");
+    if (!fp) {
+      return;
+    }
+
+    // Calling fseek(fp, 0, SEEK_END) isn't strictly-conforming C code, but
+    // since neither the WIN32 nor POSIX APIs are available in this branch, this
+    // is a reasonable fallback option.
+    if (fseek(fp, 0, SEEK_END) != 0) {
+      fclose(fp);
+      return;
+    }
+    const long ftell_result = ftell(fp);
+    if (ftell_result < 0) {
+      // Error from ftell
+      fclose(fp);
+      return;
+    }
+    size = static_cast<size_t>(ftell_result);
+    if (fseek(fp, 0, SEEK_SET) != 0) {
+      fclose(fp);
+      size = 0;
+      return;
+    }
+
+    data = reinterpret_cast<unsigned char *>(malloc(size));
+    if (!data) {
+      size = 0;
+      fclose(fp);
+      return;
+    }
+    size_t read_bytes = fread(data, 1, size, fp);
+    if (read_bytes != size) {
+      // TODO: Try to read data until reading `size` bytes.
+      fclose(fp);
+      size = 0; 
+      data = nullptr;
+      return;
+    }
+    fclose(fp);
+#endif
+  }
+
+  // MemoryMappedFile's destructor closes all its handles.
+  ~MemoryMappedFile() {
+#ifdef TINYEXR_USE_WIN32_MMAP
+    if (data) {
+      (void)UnmapViewOfFile(data);
+      data = NULL;
+    }
+
+    if (windows_file_mapping != NULL) {
+      (void)CloseHandle(windows_file_mapping);
+    }
+
+    if (windows_file != INVALID_HANDLE_VALUE) {
+      (void)CloseHandle(windows_file);
+    }
+#elif defined(TINYEXR_USE_POSIX_MMAP)
+    if (data) {
+      (void)munmap(data, size);
+      data = NULL;
+    }
+
+    if (posix_descriptor != -1) {
+      (void)close(posix_descriptor);
+    }
+#else
+    if (data) {
+      (void)free(data);
+    }
+    data = NULL;
+#endif
+  }
+
+  // A MemoryMappedFile cannot be copied or moved.
+  // Only check for this when compiling with C++11 or higher, since deleted
+  // function definitions were added then.
+#if TINYEXR_HAS_CXX11
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#endif
+  MemoryMappedFile(const MemoryMappedFile &) = delete;
+  MemoryMappedFile &operator=(const MemoryMappedFile &) = delete;
+  MemoryMappedFile(MemoryMappedFile &&other) noexcept = delete;
+  MemoryMappedFile &operator=(MemoryMappedFile &&other) noexcept = delete;
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#endif
+
+  // Returns whether this was successfully opened.
+  bool valid() const { return data; }
+};
+
+int LoadEXRImageFromFile(EXRImage *exr_image, const EXRHeader *exr_header,
+                         const char *filename, const char **err) {
+  if (exr_image == NULL) {
+    tinyexr::SetErrorMessage("Invalid argument for LoadEXRImageFromFile", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  MemoryMappedFile file(filename);
+  if (!file.valid()) {
+    tinyexr::SetErrorMessage("Cannot read file " + std::string(filename), err);
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  if (file.size < 16) {
+    tinyexr::SetErrorMessage("File size too short : " + std::string(filename),
+                             err);
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  return LoadEXRImageFromMemory(exr_image, exr_header, file.data, file.size,
+                                err);
+}
+
+int LoadEXRImageFromMemory(EXRImage *exr_image, const EXRHeader *exr_header,
+                           const unsigned char *memory, const size_t size,
+                           const char **err) {
+  if (exr_image == NULL || memory == NULL ||
+      (size < tinyexr::kEXRVersionSize)) {
+    tinyexr::SetErrorMessage("Invalid argument for LoadEXRImageFromMemory",
+                             err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (exr_header->header_len == 0) {
+    tinyexr::SetErrorMessage("EXRHeader variable is not initialized.", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  const unsigned char *head = memory;
+  const unsigned char *marker = reinterpret_cast<const unsigned char *>(
+      memory + exr_header->header_len +
+      8);  // +8 for magic number + version header.
+  return tinyexr::DecodeEXRImage(exr_image, exr_header, head, marker, size,
+                                 err);
+}
+
+namespace tinyexr
+{
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#endif
+
+// out_data must be allocated initially with the block-header size
+// of the current image(-part) type
+static bool EncodePixelData(/* out */ std::vector<unsigned char>& out_data,
+                            const unsigned char* const* images,
+                            int compression_type,
+                            int /*line_order*/,
+                            int width, // for tiled : tile.width
+                            int /*height*/, // for tiled : header.tile_size_y
+                            int x_stride, // for tiled : header.tile_size_x
+                            int line_no, // for tiled : 0
+                            int num_lines, // for tiled : tile.height
+                            size_t pixel_data_size,
+                            const std::vector<ChannelInfo>& channels,
+                            const std::vector<size_t>& channel_offset_list,
+                            std::string *err,
+                            const void* compression_param = 0) // zfp compression param
+{
+  size_t buf_size = static_cast<size_t>(width) *
+                  static_cast<size_t>(num_lines) *
+                  static_cast<size_t>(pixel_data_size);
+  //int last2bit = (buf_size & 3);
+  // buf_size must be multiple of four
+  //if(last2bit) buf_size += 4 - last2bit;
+  std::vector<unsigned char> buf(buf_size);
+
+  size_t start_y = static_cast<size_t>(line_no);
+  for (size_t c = 0; c < channels.size(); c++) {
+    if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      if (channels[c].requested_pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        for (int y = 0; y < num_lines; y++) {
+          // Assume increasing Y
+          float *line_ptr = reinterpret_cast<float *>(&buf.at(
+            static_cast<size_t>(pixel_data_size * size_t(y) * size_t(width)) +
+            channel_offset_list[c] *
+            static_cast<size_t>(width)));
+          for (int x = 0; x < width; x++) {
+            tinyexr::FP16 h16;
+            h16.u = reinterpret_cast<const unsigned short * const *>(
+              images)[c][(y + start_y) * size_t(x_stride) + size_t(x)];
+
+            tinyexr::FP32 f32 = half_to_float(h16);
+
+            tinyexr::swap4(&f32.f);
+
+            // line_ptr[x] = f32.f;
+            tinyexr::cpy4(line_ptr + x, &(f32.f));
+          }
+        }
+      } else if (channels[c].requested_pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (int y = 0; y < num_lines; y++) {
+          // Assume increasing Y
+          unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+            &buf.at(static_cast<size_t>(pixel_data_size * y *
+                                        width) +
+                    channel_offset_list[c] *
+                    static_cast<size_t>(width)));
+          for (int x = 0; x < width; x++) {
+            unsigned short val = reinterpret_cast<const unsigned short * const *>(
+              images)[c][(y + start_y) * x_stride + x];
+
+            tinyexr::swap2(&val);
+
+            // line_ptr[x] = val;
+            tinyexr::cpy2(line_ptr + x, &val);
+          }
+        }
+      } else {
+        if (err) {
+          (*err) += "Invalid requested_pixel_type.\n";
+        }
+        return false;
+      }
+
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+      if (channels[c].requested_pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (int y = 0; y < num_lines; y++) {
+          // Assume increasing Y
+          unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+            &buf.at(static_cast<size_t>(pixel_data_size * y *
+                                        width) +
+                    channel_offset_list[c] *
+                    static_cast<size_t>(width)));
+          for (int x = 0; x < width; x++) {
+            tinyexr::FP32 f32;
+            f32.f = reinterpret_cast<const float * const *>(
+              images)[c][(y + start_y) * x_stride + x];
+
+            tinyexr::FP16 h16;
+            h16 = float_to_half_full(f32);
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&h16.u));
+
+            // line_ptr[x] = h16.u;
+            tinyexr::cpy2(line_ptr + x, &(h16.u));
+          }
+        }
+      } else if (channels[c].requested_pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        for (int y = 0; y < num_lines; y++) {
+          // Assume increasing Y
+          float *line_ptr = reinterpret_cast<float *>(&buf.at(
+            static_cast<size_t>(pixel_data_size * y * width) +
+            channel_offset_list[c] *
+            static_cast<size_t>(width)));
+          for (int x = 0; x < width; x++) {
+            float val = reinterpret_cast<const float * const *>(
+              images)[c][(y + start_y) * x_stride + x];
+
+            tinyexr::swap4(&val);
+
+            // line_ptr[x] = val;
+            tinyexr::cpy4(line_ptr + x, &val);
+          }
+        }
+      } else {
+        if (err) {
+          (*err) += "Invalid requested_pixel_type.\n";
+        }
+        return false;
+      }
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+      for (int y = 0; y < num_lines; y++) {
+        // Assume increasing Y
+        unsigned int *line_ptr = reinterpret_cast<unsigned int *>(&buf.at(
+          static_cast<size_t>(pixel_data_size * y * width) +
+          channel_offset_list[c] * static_cast<size_t>(width)));
+        for (int x = 0; x < width; x++) {
+          unsigned int val = reinterpret_cast<const unsigned int * const *>(
+            images)[c][(y + start_y) * x_stride + x];
+
+          tinyexr::swap4(&val);
+
+          // line_ptr[x] = val;
+          tinyexr::cpy4(line_ptr + x, &val);
+        }
+      }
+    }
+  }
+
+  if (compression_type == TINYEXR_COMPRESSIONTYPE_NONE) {
+    // 4 byte: scan line
+    // 4 byte: data size
+    // ~     : pixel data(uncompressed)
+    out_data.insert(out_data.end(), buf.begin(), buf.end());
+
+  } else if ((compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS) ||
+    (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP)) {
+#if defined(TINYEXR_USE_MINIZ) && (TINYEXR_USE_MINIZ==1)
+    std::vector<unsigned char> block(buminiz::mz_compressBound(
+      static_cast<unsigned long>(buf.size())));
+#elif TINYEXR_USE_STB_ZLIB
+    // there is no compressBound() function, so we use a value that
+    // is grossly overestimated, but should always work
+    std::vector<unsigned char> block(256 + 2 * buf.size());
+#elif defined(TINYEXR_USE_NANOZLIB) && (TINYEXR_USE_NANOZLIB == 1)
+    std::vector<unsigned char> block(nanoz_compressBound(
+      static_cast<unsigned long>(buf.size())));
+#else
+    std::vector<unsigned char> block(
+      compressBound(static_cast<uLong>(buf.size())));
+#endif
+    tinyexr::tinyexr_uint64 outSize = block.size();
+
+    if (!tinyexr::CompressZip(&block.at(0), outSize,
+                         reinterpret_cast<const unsigned char *>(&buf.at(0)),
+                         static_cast<unsigned long>(buf.size()))) {
+      if (err) {
+        (*err) += "Zip compresssion failed.\n";
+      }
+      return false;
+    }
+
+    // 4 byte: scan line
+    // 4 byte: data size
+    // ~     : pixel data(compressed)
+    unsigned int data_len = static_cast<unsigned int>(outSize);  // truncate
+
+    out_data.insert(out_data.end(), block.begin(), block.begin() + data_len);
+
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) {
+    // (buf.size() * 3) / 2 would be enough.
+    std::vector<unsigned char> block((buf.size() * 3) / 2);
+
+    tinyexr::tinyexr_uint64 outSize = block.size();
+
+    if (!tinyexr::CompressRle(&block.at(0), outSize,
+                         reinterpret_cast<const unsigned char *>(&buf.at(0)),
+                         static_cast<unsigned long>(buf.size()))) {
+      if (err) {
+        (*err) += "RLE compresssion failed.\n";
+      }
+      return false;
+    }
+
+    // 4 byte: scan line
+    // 4 byte: data size
+    // ~     : pixel data(compressed)
+    unsigned int data_len = static_cast<unsigned int>(outSize);  // truncate
+    out_data.insert(out_data.end(), block.begin(), block.begin() + data_len);
+
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+#if TINYEXR_USE_PIZ
+    unsigned int bufLen =
+      8192 + static_cast<unsigned int>(
+        2 * static_cast<unsigned int>(
+          buf.size()));  // @fixme { compute good bound. }
+    std::vector<unsigned char> block(bufLen);
+    unsigned int outSize = static_cast<unsigned int>(block.size());
+
+    if (!CompressPiz(&block.at(0), &outSize,
+                reinterpret_cast<const unsigned char *>(&buf.at(0)),
+                buf.size(), channels, width, num_lines)) {
+      if (err) {
+        (*err) += "PIZ compresssion failed.\n";
+      }
+      return false;
+    }
+
+    // 4 byte: scan line
+    // 4 byte: data size
+    // ~     : pixel data(compressed)
+    unsigned int data_len = outSize;
+    out_data.insert(out_data.end(), block.begin(), block.begin() + data_len);
+
+#else
+    if (err) {
+      (*err) += "PIZ compression is disabled in this build.\n";
+    }
+    return false;
+#endif
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if TINYEXR_USE_ZFP
+    const ZFPCompressionParam* zfp_compression_param = reinterpret_cast<const ZFPCompressionParam*>(compression_param);
+    std::vector<unsigned char> block;
+    unsigned int outSize;
+
+    tinyexr::CompressZfp(
+      &block, &outSize, reinterpret_cast<const float *>(&buf.at(0)),
+      width, num_lines, static_cast<int>(channels.size()), *zfp_compression_param);
+
+    // 4 byte: scan line
+    // 4 byte: data size
+    // ~     : pixel data(compressed)
+    unsigned int data_len = outSize;
+    out_data.insert(out_data.end(), block.begin(), block.begin() + data_len);
+
+#else
+    if (err) {
+      (*err) += "ZFP compression is disabled in this build.\n";
+    }
+    (void)compression_param;
+    return false;
+#endif
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+static int EncodeTiledLevel(const EXRImage* level_image, const EXRHeader* exr_header,
+                            const std::vector<tinyexr::ChannelInfo>& channels,
+                            std::vector<std::vector<unsigned char> >& data_list,
+                            size_t start_index, // for data_list
+                            int num_x_tiles, int num_y_tiles,
+                            const std::vector<size_t>& channel_offset_list,
+                            int pixel_data_size,
+                            const void* compression_param, // must be set if zfp compression is enabled
+                            std::string* err) {
+  int num_tiles = num_x_tiles * num_y_tiles;
+  if (num_tiles != level_image->num_tiles) {
+    if (err) {
+      (*err) += "Invalid number of tiles in argument.\n";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if ((exr_header->tile_size_x > level_image->width || exr_header->tile_size_y > level_image->height) &&
+      level_image->level_x == 0 && level_image->level_y == 0) {
+      if (err) {
+        (*err) += "Failed to encode tile data.\n";
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  std::atomic<bool> invalid_data(false);
+#else
+  bool invalid_data(false);
+#endif
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  std::vector<std::thread> workers;
+  std::atomic<int> tile_count(0);
+
+  int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
+  if (num_threads > int(num_tiles)) {
+    num_threads = int(num_tiles);
+  }
+
+  for (int t = 0; t < num_threads; t++) {
+    workers.emplace_back(std::thread([&]() {
+      int i = 0;
+      while ((i = tile_count++) < num_tiles) {
+
+#else
+  // Use signed int since some OpenMP compiler doesn't allow unsigned type for
+  // `parallel for`
+#if TINYEXR_USE_OPENMP
+#pragma omp parallel for
+#endif
+  for (int i = 0; i < num_tiles; i++) {
+
+#endif
+    size_t tile_idx = static_cast<size_t>(i);
+    size_t data_idx = tile_idx + start_index;
+
+    int x_tile = i % num_x_tiles;
+    int y_tile = i / num_x_tiles;
+
+    EXRTile& tile = level_image->tiles[tile_idx];
+
+    const unsigned char* const* images =
+      static_cast<const unsigned char* const*>(tile.images);
+
+    data_list[data_idx].resize(5*sizeof(int));
+    size_t data_header_size = data_list[data_idx].size();
+    bool ret = EncodePixelData(data_list[data_idx],
+                               images,
+                               exr_header->compression_type,
+                               0, // increasing y
+                               tile.width,
+                               exr_header->tile_size_y,
+                               exr_header->tile_size_x,
+                               0,
+                               tile.height,
+                               pixel_data_size,
+                               channels,
+                               channel_offset_list,
+                               err, compression_param);
+    if (!ret) {
+      invalid_data = true;
+      continue;
+    }
+    if (data_list[data_idx].size() <= data_header_size) {
+      invalid_data = true;
+      continue;
+    }
+
+    int data_len = static_cast<int>(data_list[data_idx].size() - data_header_size);
+    //tileX, tileY, levelX, levelY // pixel_data_size(int)
+    memcpy(&data_list[data_idx][0], &x_tile, sizeof(int));
+    memcpy(&data_list[data_idx][4], &y_tile, sizeof(int));
+    memcpy(&data_list[data_idx][8], &level_image->level_x, sizeof(int));
+    memcpy(&data_list[data_idx][12], &level_image->level_y, sizeof(int));
+    memcpy(&data_list[data_idx][16], &data_len, sizeof(int));
+
+    swap4(reinterpret_cast<int*>(&data_list[data_idx][0]));
+    swap4(reinterpret_cast<int*>(&data_list[data_idx][4]));
+    swap4(reinterpret_cast<int*>(&data_list[data_idx][8]));
+    swap4(reinterpret_cast<int*>(&data_list[data_idx][12]));
+    swap4(reinterpret_cast<int*>(&data_list[data_idx][16]));
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+  }
+}));
+    }
+
+    for (auto &t : workers) {
+      t.join();
+    }
+#else
+    }  // omp parallel
+#endif
+
+  if (invalid_data) {
+    if (err) {
+      (*err) += "Failed to encode tile data.\n";
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+  return TINYEXR_SUCCESS;
+}
+
+static int NumScanlines(int compression_type) {
+  int num_scanlines = 1;
+  if (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    num_scanlines = 16;
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    num_scanlines = 32;
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    num_scanlines = 16;
+  }
+  return num_scanlines;
+}
+
+static int EncodeChunk(const EXRImage* exr_image, const EXRHeader* exr_header,
+                       const std::vector<ChannelInfo>& channels,
+                       int num_blocks,
+                       tinyexr_uint64 chunk_offset, // starting offset of current chunk
+                       bool is_multipart,
+                       OffsetData& offset_data, // output block offsets, must be initialized
+                       std::vector<std::vector<unsigned char> >& data_list, // output
+                       tinyexr_uint64& total_size, // output: ending offset of current chunk
+                       std::string* err) {
+  int num_scanlines = NumScanlines(exr_header->compression_type);
+
+  data_list.resize(num_blocks);
+
+  std::vector<size_t> channel_offset_list(
+    static_cast<size_t>(exr_header->num_channels));
+
+  int pixel_data_size = 0;
+  {
+    size_t channel_offset = 0;
+    for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+      channel_offset_list[c] = channel_offset;
+      if (channels[c].requested_pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        pixel_data_size += sizeof(unsigned short);
+        channel_offset += sizeof(unsigned short);
+      } else if (channels[c].requested_pixel_type ==
+                 TINYEXR_PIXELTYPE_FLOAT) {
+        pixel_data_size += sizeof(float);
+        channel_offset += sizeof(float);
+      } else if (channels[c].requested_pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        pixel_data_size += sizeof(unsigned int);
+        channel_offset += sizeof(unsigned int);
+      } else {
+        if (err) {
+          (*err) += "Invalid requested_pixel_type.\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+    }
+  }
+
+  const void* compression_param = 0;
+#if TINYEXR_USE_ZFP
+  tinyexr::ZFPCompressionParam zfp_compression_param;
+
+  // Use ZFP compression parameter from custom attributes(if such a parameter
+  // exists)
+  {
+    std::string e;
+    bool ret = tinyexr::FindZFPCompressionParam(
+      &zfp_compression_param, exr_header->custom_attributes,
+      exr_header->num_custom_attributes, &e);
+
+    if (!ret) {
+      // Use predefined compression parameter.
+      zfp_compression_param.type = 0;
+      zfp_compression_param.rate = 2;
+    }
+    compression_param = &zfp_compression_param;
+  }
+#endif
+
+  tinyexr_uint64 offset = chunk_offset;
+  tinyexr_uint64 doffset = is_multipart ? 4u : 0u;
+
+  if (exr_image->tiles) {
+    const EXRImage* level_image = exr_image;
+    size_t block_idx = 0;
+    //tinyexr::tinyexr_uint64 block_data_size = 0;
+    int num_levels = (exr_header->tile_level_mode != TINYEXR_TILE_RIPMAP_LEVELS) ?
+      offset_data.num_x_levels : (offset_data.num_x_levels * offset_data.num_y_levels);
+    for (int level_index = 0; level_index < num_levels; ++level_index) {
+      if (!level_image) {
+        if (err) {
+          (*err) += "Invalid number of tiled levels for EncodeChunk\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      int level_index_from_image = LevelIndex(level_image->level_x, level_image->level_y,
+                                    exr_header->tile_level_mode, offset_data.num_x_levels);
+      if (level_index_from_image < 0) {
+        if (err) {
+          (*err) += "Invalid tile level mode\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      if (level_index_from_image != level_index) {
+        if (err) {
+          (*err) += "Incorrect level ordering in tiled image\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+      int num_y_tiles = int(offset_data.offsets[level_index].size());
+      if (num_y_tiles <= 0) {
+        if (err) {
+          (*err) += "Invalid Y tile size\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      int num_x_tiles = int(offset_data.offsets[level_index][0].size());
+      if (num_x_tiles <= 0) {
+        if (err) {
+          (*err) += "Invalid X tile size\n";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      std::string e;
+      int ret = EncodeTiledLevel(level_image,
+                                  exr_header,
+                                  channels,
+                                  data_list,
+                                  block_idx,
+                                  num_x_tiles,
+                                  num_y_tiles,
+                                  channel_offset_list,
+                                  pixel_data_size,
+                                  compression_param,
+                                  &e);
+      if (ret != TINYEXR_SUCCESS) {
+        if (!e.empty() && err) {
+          (*err) += e;
+        }
+        return ret;
+      }
+
+      for (size_t j = 0; j < static_cast<size_t>(num_y_tiles); ++j)
+        for (size_t i = 0; i < static_cast<size_t>(num_x_tiles); ++i) {
+          offset_data.offsets[level_index][j][i] = offset;
+          swap8(reinterpret_cast<tinyexr_uint64*>(&offset_data.offsets[level_index][j][i]));
+          offset += data_list[block_idx].size() + doffset;
+          //block_data_size += data_list[block_idx].size();
+          ++block_idx;
+        }
+      level_image = level_image->next_level;
+    }
+    TINYEXR_CHECK_AND_RETURN_C(static_cast<int>(block_idx) == num_blocks, TINYEXR_ERROR_INVALID_DATA);
+    total_size = offset;
+  } else { // scanlines
+    std::vector<tinyexr::tinyexr_uint64>& offsets = offset_data.offsets[0][0];
+
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+    std::atomic<bool> invalid_data(false);
+    std::vector<std::thread> workers;
+    std::atomic<int> block_count(0);
+
+    int num_threads = std::min(std::max(1, int(std::thread::hardware_concurrency())), num_blocks);
+
+    for (int t = 0; t < num_threads; t++) {
+      workers.emplace_back(std::thread([&]() {
+        int i = 0;
+        while ((i = block_count++) < num_blocks) {
+
+#else
+    bool invalid_data(false);
+#if TINYEXR_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int i = 0; i < num_blocks; i++) {
+
+#endif
+      int start_y = num_scanlines * i;
+      int end_Y = (std::min)(num_scanlines * (i + 1), exr_image->height);
+      int num_lines = end_Y - start_y;
+
+      const unsigned char* const* images =
+        static_cast<const unsigned char* const*>(exr_image->images);
+
+      data_list[i].resize(2*sizeof(int));
+      size_t data_header_size = data_list[i].size();
+
+      bool ret = EncodePixelData(data_list[i],
+                                 images,
+                                 exr_header->compression_type,
+                                 0, // increasing y
+                                 exr_image->width,
+                                 exr_image->height,
+                                 exr_image->width,
+                                 start_y,
+                                 num_lines,
+                                 pixel_data_size,
+                                 channels,
+                                 channel_offset_list,
+                                 err,
+                                 compression_param);
+      if (!ret) {
+        invalid_data = true;
+        continue; // "break" cannot be used with OpenMP
+      }
+      if (data_list[i].size() <= data_header_size) {
+        invalid_data = true;
+        continue; // "break" cannot be used with OpenMP
+      }
+      int data_len = static_cast<int>(data_list[i].size() - data_header_size);
+      memcpy(&data_list[i][0], &start_y, sizeof(int));
+      memcpy(&data_list[i][4], &data_len, sizeof(int));
+
+      swap4(reinterpret_cast<int*>(&data_list[i][0]));
+      swap4(reinterpret_cast<int*>(&data_list[i][4]));
+#if TINYEXR_HAS_CXX11 && (TINYEXR_USE_THREAD > 0)
+        }
+                                       }));
+    }
+
+    for (auto &t : workers) {
+      t.join();
+    }
+#else
+    }  // omp parallel
+#endif
+
+    if (invalid_data) {
+      if (err) {
+        (*err) += "Failed to encode scanline data.\n";
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    for (size_t i = 0; i < static_cast<size_t>(num_blocks); i++) {
+      offsets[i] = offset;
+      tinyexr::swap8(reinterpret_cast<tinyexr::tinyexr_uint64 *>(&offsets[i]));
+      offset += data_list[i].size() + doffset;
+    }
+
+    total_size = static_cast<size_t>(offset);
+  }
+  return TINYEXR_SUCCESS;
+}
+
+// can save a single or multi-part image (no deep* formats)
+static size_t SaveEXRNPartImageToMemory(const EXRImage* exr_images,
+                                        const EXRHeader** exr_headers,
+                                        unsigned int num_parts,
+                                        unsigned char** memory_out, const char** err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts == 0 ||
+      memory_out == NULL) {
+    SetErrorMessage("Invalid argument for SaveEXRNPartImageToMemory",
+                    err);
+    return 0;
+  }
+  {
+    for (unsigned int i = 0; i < num_parts; ++i) {
+      if (exr_headers[i]->compression_type < 0) {
+        SetErrorMessage("Invalid argument for SaveEXRNPartImageToMemory",
+                        err);
+        return 0;
+      }
+#if !TINYEXR_USE_PIZ
+      if (exr_headers[i]->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+        SetErrorMessage("PIZ compression is not supported in this build",
+                        err);
+        return 0;
+      }
+#endif
+      if (exr_headers[i]->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if !TINYEXR_USE_ZFP
+        SetErrorMessage("ZFP compression is not supported in this build",
+                        err);
+        return 0;
+#else
+        // All channels must be fp32.
+        // No fp16 support in ZFP atm(as of 2023 June)
+        // https://github.com/LLNL/fpzip/issues/2
+        for (int c = 0; c < exr_headers[i]->num_channels; ++c) {
+          if (exr_headers[i]->requested_pixel_types[c] != TINYEXR_PIXELTYPE_FLOAT) {
+            SetErrorMessage("Pixel type must be FLOAT for ZFP compression",
+                            err);
+            return 0;
+          }
+        }
+#endif
+      }
+    }
+  }
+
+  std::vector<unsigned char> memory;
+
+  // Header
+  {
+    const char header[] = { 0x76, 0x2f, 0x31, 0x01 };
+    memory.insert(memory.end(), header, header + 4);
+  }
+
+  // Version
+  // using value from the first header
+  int long_name = exr_headers[0]->long_name;
+  {
+    char marker[] = { 2, 0, 0, 0 };
+    /* @todo
+    if (exr_header->non_image) {
+    marker[1] |= 0x8;
+    }
+    */
+    // tiled
+    if (num_parts == 1 && exr_images[0].tiles) {
+      marker[1] |= 0x2;
+    }
+    // long_name
+    if (long_name) {
+      marker[1] |= 0x4;
+    }
+    // multipart
+    if (num_parts > 1) {
+      marker[1] |= 0x10;
+    }
+    memory.insert(memory.end(), marker, marker + 4);
+  }
+
+  int total_chunk_count = 0;
+  std::vector<int> chunk_count(num_parts);
+  std::vector<OffsetData> offset_data(num_parts);
+  for (unsigned int i = 0; i < num_parts; ++i) {
+    if (!exr_images[i].tiles) {
+      int num_scanlines = NumScanlines(exr_headers[i]->compression_type);
+      chunk_count[i] =
+        (exr_images[i].height + num_scanlines - 1) / num_scanlines;
+      InitSingleResolutionOffsets(offset_data[i], chunk_count[i]);
+      total_chunk_count += chunk_count[i];
+    } else {
+      {
+        std::vector<int> num_x_tiles, num_y_tiles;
+        if (!PrecalculateTileInfo(num_x_tiles, num_y_tiles, exr_headers[i])) {
+          SetErrorMessage("Failed to precalculate Tile info",
+                          err);
+          return (size_t)TINYEXR_ERROR_INVALID_DATA;
+        }
+        int ntiles = InitTileOffsets(offset_data[i], exr_headers[i], num_x_tiles, num_y_tiles);
+        if (ntiles > 0) {
+          chunk_count[i] = ntiles;
+        } else {
+          SetErrorMessage("Failed to compute Tile offsets",
+                          err);
+          return (size_t)TINYEXR_ERROR_INVALID_DATA;
+          
+        }
+        total_chunk_count += chunk_count[i];
+      }
+    }
+  }
+  // Write attributes to memory buffer.
+  std::vector< std::vector<tinyexr::ChannelInfo> > channels(num_parts);
+  {
+    std::set<std::string> partnames;
+    for (unsigned int i = 0; i < num_parts; ++i) {
+      //channels
+      {
+        std::vector<unsigned char> data;
+
+        for (int c = 0; c < exr_headers[i]->num_channels; c++) {
+          tinyexr::ChannelInfo info;
+          info.p_linear = 0;
+          info.pixel_type = exr_headers[i]->pixel_types[c];
+          info.requested_pixel_type = exr_headers[i]->requested_pixel_types[c];
+          info.x_sampling = 1;
+          info.y_sampling = 1;
+          info.name = std::string(exr_headers[i]->channels[c].name);
+          channels[i].push_back(info);
+        }
+
+        tinyexr::WriteChannelInfo(data, channels[i]);
+
+        tinyexr::WriteAttributeToMemory(&memory, "channels", "chlist", &data.at(0),
+                                        static_cast<int>(data.size()));
+      }
+
+      {
+        int comp = exr_headers[i]->compression_type;
+        swap4(&comp);
+        WriteAttributeToMemory(
+          &memory, "compression", "compression",
+          reinterpret_cast<const unsigned char*>(&comp), 1);
+      }
+
+      {
+        int data[4] = { 0, 0, exr_images[i].width - 1, exr_images[i].height - 1 };
+        swap4(&data[0]);
+        swap4(&data[1]);
+        swap4(&data[2]);
+        swap4(&data[3]);
+        WriteAttributeToMemory(
+          &memory, "dataWindow", "box2i",
+          reinterpret_cast<const unsigned char*>(data), sizeof(int) * 4);
+
+        int data0[4] = { 0, 0, exr_images[0].width - 1, exr_images[0].height - 1 };
+        swap4(&data0[0]);
+        swap4(&data0[1]);
+        swap4(&data0[2]);
+        swap4(&data0[3]);
+        // Note: must be the same across parts (currently, using value from the first header)
+        WriteAttributeToMemory(
+          &memory, "displayWindow", "box2i",
+          reinterpret_cast<const unsigned char*>(data0), sizeof(int) * 4);
+      }
+
+      {
+        unsigned char line_order = 0;  // @fixme { read line_order from EXRHeader }
+        WriteAttributeToMemory(&memory, "lineOrder", "lineOrder",
+                               &line_order, 1);
+      }
+
+      {
+        // Note: must be the same across parts
+        float aspectRatio = 1.0f;
+        swap4(&aspectRatio);
+        WriteAttributeToMemory(
+          &memory, "pixelAspectRatio", "float",
+          reinterpret_cast<const unsigned char*>(&aspectRatio), sizeof(float));
+      }
+
+      {
+        float center[2] = { 0.0f, 0.0f };
+        swap4(&center[0]);
+        swap4(&center[1]);
+        WriteAttributeToMemory(
+          &memory, "screenWindowCenter", "v2f",
+          reinterpret_cast<const unsigned char*>(center), 2 * sizeof(float));
+      }
+
+      {
+        float w = 1.0f;
+        swap4(&w);
+        WriteAttributeToMemory(&memory, "screenWindowWidth", "float",
+                               reinterpret_cast<const unsigned char*>(&w),
+                               sizeof(float));
+      }
+
+      if (exr_images[i].tiles) {
+        unsigned char tile_mode = static_cast<unsigned char>(exr_headers[i]->tile_level_mode & 0x3);
+        if (exr_headers[i]->tile_rounding_mode) tile_mode |= (1u << 4u);
+        //unsigned char data[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+        unsigned int datai[3] = { 0, 0, 0 };
+        unsigned char* data = reinterpret_cast<unsigned char*>(&datai[0]);
+        datai[0] = static_cast<unsigned int>(exr_headers[i]->tile_size_x);
+        datai[1] = static_cast<unsigned int>(exr_headers[i]->tile_size_y);
+        data[8] = tile_mode;
+        swap4(reinterpret_cast<unsigned int*>(&data[0]));
+        swap4(reinterpret_cast<unsigned int*>(&data[4]));
+        WriteAttributeToMemory(
+          &memory, "tiles", "tiledesc",
+          reinterpret_cast<const unsigned char*>(data), 9);
+      }
+
+      // must be present for multi-part files - according to spec.
+      if (num_parts > 1) {
+        // name
+        {
+          size_t len = 0;
+          if ((len = strlen(exr_headers[i]->name)) > 0) {
+#if TINYEXR_HAS_CXX11
+            partnames.emplace(exr_headers[i]->name);
+#else
+            partnames.insert(std::string(exr_headers[i]->name));
+#endif
+            if (partnames.size() != i + 1) {
+              SetErrorMessage("'name' attributes must be unique for a multi-part file", err);
+              return 0;
+            }
+            WriteAttributeToMemory(
+              &memory, "name", "string",
+              reinterpret_cast<const unsigned char*>(exr_headers[i]->name),
+              static_cast<int>(len));
+          } else {
+            SetErrorMessage("Invalid 'name' attribute for a multi-part file", err);
+            return 0;
+          }
+        }
+        // type
+        {
+          const char* type = "scanlineimage";
+          if (exr_images[i].tiles) type = "tiledimage";
+          WriteAttributeToMemory(
+            &memory, "type", "string",
+            reinterpret_cast<const unsigned char*>(type),
+            static_cast<int>(strlen(type)));
+        }
+        // chunkCount
+        {
+          WriteAttributeToMemory(
+            &memory, "chunkCount", "int",
+            reinterpret_cast<const unsigned char*>(&chunk_count[i]),
+            4);
+        }
+      }
+
+      // Custom attributes
+      if (exr_headers[i]->num_custom_attributes > 0) {
+        for (int j = 0; j < exr_headers[i]->num_custom_attributes; j++) {
+          tinyexr::WriteAttributeToMemory(
+            &memory, exr_headers[i]->custom_attributes[j].name,
+            exr_headers[i]->custom_attributes[j].type,
+            reinterpret_cast<const unsigned char*>(
+              exr_headers[i]->custom_attributes[j].value),
+            exr_headers[i]->custom_attributes[j].size);
+        }
+      }
+
+      {  // end of header
+        memory.push_back(0);
+      }
+    }
+  }
+  if (num_parts > 1) {
+    // end of header list
+    memory.push_back(0);
+  }
+
+  tinyexr_uint64 chunk_offset = memory.size() + size_t(total_chunk_count) * sizeof(tinyexr_uint64);
+
+  tinyexr_uint64 total_size = 0;
+  std::vector< std::vector< std::vector<unsigned char> > > data_lists(num_parts);
+  for (unsigned int i = 0; i < num_parts; ++i) {
+    std::string e;
+    int ret = EncodeChunk(&exr_images[i], exr_headers[i],
+                          channels[i],
+                          chunk_count[i],
+                          // starting offset of current chunk after part-number
+                          chunk_offset,
+                          num_parts > 1,
+                          offset_data[i], // output: block offsets, must be initialized
+                          data_lists[i], // output
+                          total_size, // output
+                          &e);
+    if (ret != TINYEXR_SUCCESS) {
+      if (!e.empty()) {
+        tinyexr::SetErrorMessage(e, err);
+      }
+      return 0;
+    }
+    chunk_offset = total_size;
+  }
+
+  // Allocating required memory
+  if (total_size == 0) { // something went wrong
+    tinyexr::SetErrorMessage("Output memory size is zero", err);
+    return (size_t)TINYEXR_ERROR_INVALID_DATA;
+  }
+  (*memory_out) = static_cast<unsigned char*>(malloc(size_t(total_size)));
+
+  // Writing header
+  memcpy((*memory_out), &memory[0], memory.size());
+  unsigned char* memory_ptr = *memory_out + memory.size();
+  size_t sum = memory.size();
+
+  // Writing offset data for chunks
+  for (unsigned int i = 0; i < num_parts; ++i) {
+    if (exr_images[i].tiles) {
+      const EXRImage* level_image = &exr_images[i];
+      int num_levels = (exr_headers[i]->tile_level_mode != TINYEXR_TILE_RIPMAP_LEVELS) ?
+        offset_data[i].num_x_levels : (offset_data[i].num_x_levels * offset_data[i].num_y_levels);
+      for (int level_index = 0; level_index < num_levels; ++level_index) {
+        for (size_t j = 0; j < offset_data[i].offsets[level_index].size(); ++j) {
+          size_t num_bytes = sizeof(tinyexr_uint64) * offset_data[i].offsets[level_index][j].size();
+          sum += num_bytes;
+          if (sum > total_size) {
+            tinyexr::SetErrorMessage("Invalid offset bytes in Tiled Part image.", err);
+            return (size_t)TINYEXR_ERROR_INVALID_DATA;
+          }
+
+          memcpy(memory_ptr,
+                 reinterpret_cast<unsigned char*>(&offset_data[i].offsets[level_index][j][0]),
+                 num_bytes);
+          memory_ptr += num_bytes;
+        }
+        level_image = level_image->next_level;
+      }
+    } else {
+      size_t num_bytes = sizeof(tinyexr::tinyexr_uint64) * static_cast<size_t>(chunk_count[i]);
+      sum += num_bytes;
+      if (sum > total_size) {
+        tinyexr::SetErrorMessage("Invalid offset bytes in Part image.", err);
+        return (size_t)TINYEXR_ERROR_INVALID_DATA;
+      }
+      std::vector<tinyexr::tinyexr_uint64>& offsets = offset_data[i].offsets[0][0];
+      memcpy(memory_ptr, reinterpret_cast<unsigned char*>(&offsets[0]), num_bytes);
+      memory_ptr += num_bytes;
+    }
+  }
+
+  // Writing chunk data
+  for (unsigned int i = 0; i < num_parts; ++i) {
+    for (size_t j = 0; j < static_cast<size_t>(chunk_count[i]); ++j) {
+      if (num_parts > 1) {
+        sum += 4;
+        if (sum > total_size) {
+          tinyexr::SetErrorMessage("Buffer overrun in reading Part image chunk data.", err);
+          return (size_t)TINYEXR_ERROR_INVALID_DATA;
+        }
+        unsigned int part_number = i;
+        swap4(&part_number);
+        memcpy(memory_ptr, &part_number, 4);
+        memory_ptr += 4;
+      }
+      sum += data_lists[i][j].size();
+      if (sum > total_size) {
+        tinyexr::SetErrorMessage("Buffer overrun in reading Part image chunk data.", err);
+        return (size_t)TINYEXR_ERROR_INVALID_DATA;
+      }
+      memcpy(memory_ptr, &data_lists[i][j][0], data_lists[i][j].size());
+      memory_ptr += data_lists[i][j].size();
+    }
+  }
+
+  if (sum != total_size) {
+    tinyexr::SetErrorMessage("Corrupted Part image chunk data.", err);
+    return (size_t)TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  return size_t(total_size);  // OK
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+} // tinyexr
+
+size_t SaveEXRImageToMemory(const EXRImage* exr_image,
+                             const EXRHeader* exr_header,
+                             unsigned char** memory_out, const char** err) {
+  return tinyexr::SaveEXRNPartImageToMemory(exr_image, &exr_header, 1, memory_out, err);
+}
+
+int SaveEXRImageToFile(const EXRImage *exr_image, const EXRHeader *exr_header,
+                       const char *filename, const char **err) {
+  if (exr_image == NULL || filename == NULL ||
+      exr_header->compression_type < 0) {
+    tinyexr::SetErrorMessage("Invalid argument for SaveEXRImageToFile", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+#if !TINYEXR_USE_PIZ
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    tinyexr::SetErrorMessage("PIZ compression is not supported in this build",
+                             err);
+    return TINYEXR_ERROR_UNSUPPORTED_FEATURE;
+  }
+#endif
+
+#if !TINYEXR_USE_ZFP
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    tinyexr::SetErrorMessage("ZFP compression is not supported in this build",
+                             err);
+    return TINYEXR_ERROR_UNSUPPORTED_FEATURE;
+  }
+#endif
+
+  FILE *fp = NULL;
+#ifdef _WIN32
+#if defined(_MSC_VER) || (defined(MINGW_HAS_SECURE_API) && MINGW_HAS_SECURE_API) // MSVC, MinGW GCC, or Clang
+  errno_t errcode =
+      _wfopen_s(&fp, tinyexr::UTF8ToWchar(filename).c_str(), L"wb");
+  if (errcode != 0) {
+    tinyexr::SetErrorMessage("Cannot write a file: " + std::string(filename),
+                             err);
+    return TINYEXR_ERROR_CANT_WRITE_FILE;
+  }
+#else
+  // Unknown compiler or MinGW without MINGW_HAS_SECURE_API.
+  fp = fopen(filename, "wb");
+#endif
+#else
+  fp = fopen(filename, "wb");
+#endif
+  if (!fp) {
+    tinyexr::SetErrorMessage("Cannot write a file: " + std::string(filename),
+                             err);
+    return TINYEXR_ERROR_CANT_WRITE_FILE;
+  }
+
+  unsigned char *mem = NULL;
+  size_t mem_size = SaveEXRImageToMemory(exr_image, exr_header, &mem, err);
+  if (mem_size == 0) {
+    fclose(fp);
+    return TINYEXR_ERROR_SERIALIZATION_FAILED;
+  }
+
+  size_t written_size = 0;
+  if ((mem_size > 0) && mem) {
+    written_size = fwrite(mem, 1, mem_size, fp);
+  }
+  free(mem);
+
+  fclose(fp);
+
+  if (written_size != mem_size) {
+    tinyexr::SetErrorMessage("Cannot write a file", err);
+    return TINYEXR_ERROR_CANT_WRITE_FILE;
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+size_t SaveEXRMultipartImageToMemory(const EXRImage* exr_images,
+                                     const EXRHeader** exr_headers,
+                                     unsigned int num_parts,
+                                     unsigned char** memory_out, const char** err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts < 2 ||
+      memory_out == NULL) {
+    tinyexr::SetErrorMessage("Invalid argument for SaveEXRNPartImageToMemory",
+                              err);
+    return 0;
+  }
+  return tinyexr::SaveEXRNPartImageToMemory(exr_images, exr_headers, num_parts, memory_out, err);
+}
+
+int SaveEXRMultipartImageToFile(const EXRImage* exr_images,
+                                const EXRHeader** exr_headers,
+                                unsigned int num_parts,
+                                const char* filename,
+                                const char** err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts < 2) {
+    tinyexr::SetErrorMessage("Invalid argument for SaveEXRMultipartImageToFile",
+                              err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = NULL;
+#ifdef _WIN32
+#if defined(_MSC_VER) || (defined(MINGW_HAS_SECURE_API) && MINGW_HAS_SECURE_API) // MSVC, MinGW GCC, or Clang.
+  errno_t errcode =
+    _wfopen_s(&fp, tinyexr::UTF8ToWchar(filename).c_str(), L"wb");
+  if (errcode != 0) {
+    tinyexr::SetErrorMessage("Cannot write a file: " + std::string(filename),
+                             err);
+    return TINYEXR_ERROR_CANT_WRITE_FILE;
+  }
+#else
+  // Unknown compiler or MinGW without MINGW_HAS_SECURE_API.
+  fp = fopen(filename, "wb");
+#endif
+#else
+  fp = fopen(filename, "wb");
+#endif
+  if (!fp) {
+    tinyexr::SetErrorMessage("Cannot write a file: " + std::string(filename),
+                             err);
+    return TINYEXR_ERROR_CANT_WRITE_FILE;
+  }
+
+  unsigned char *mem = NULL;
+  size_t mem_size = SaveEXRMultipartImageToMemory(exr_images, exr_headers, num_parts, &mem, err);
+  if (mem_size == 0) {
+    fclose(fp);
+    return TINYEXR_ERROR_SERIALIZATION_FAILED;
+  }
+
+  size_t written_size = 0;
+  if ((mem_size > 0) && mem) {
+    written_size = fwrite(mem, 1, mem_size, fp);
+  }
+  free(mem);
+
+  fclose(fp);
+
+  if (written_size != mem_size) {
+    tinyexr::SetErrorMessage("Cannot write a file", err);
+    return TINYEXR_ERROR_CANT_WRITE_FILE;
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
+  if (deep_image == NULL) {
+    tinyexr::SetErrorMessage("Invalid argument for LoadDeepEXR", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  MemoryMappedFile file(filename);
+  if (!file.valid()) {
+    tinyexr::SetErrorMessage("Cannot read file " + std::string(filename), err);
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  if (file.size == 0) {
+    tinyexr::SetErrorMessage("File size is zero : " + std::string(filename),
+                             err);
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  const char *head = reinterpret_cast<const char *>(file.data);
+  const char *marker = reinterpret_cast<const char *>(file.data);
+
+  // Header check.
+  {
+    const char header[] = {0x76, 0x2f, 0x31, 0x01};
+
+    if (memcmp(marker, header, 4) != 0) {
+      tinyexr::SetErrorMessage("Invalid magic number", err);
+      return TINYEXR_ERROR_INVALID_MAGIC_NUMBER;
+    }
+    marker += 4;
+  }
+
+  // Version, scanline.
+  {
+    // ver 2.0, scanline, deep bit on(0x800)
+    // must be [2, 0, 0, 0]
+    if (marker[0] != 2 || marker[1] != 8 || marker[2] != 0 || marker[3] != 0) {
+      tinyexr::SetErrorMessage("Unsupported version or scanline", err);
+      return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+    }
+
+    marker += 4;
+  }
+
+  int dx = -1;
+  int dy = -1;
+  int dw = -1;
+  int dh = -1;
+  int num_scanline_blocks = 1;  // 16 for ZIP compression.
+  int compression_type = -1;
+  int num_channels = -1;
+  std::vector<tinyexr::ChannelInfo> channels;
+
+  // Read attributes
+  size_t size = file.size - tinyexr::kEXRVersionSize;
+  for (;;) {
+    if (0 == size) {
+      return TINYEXR_ERROR_INVALID_DATA;
+    } else if (marker[0] == '\0') {
+      marker++;
+      size--;
+      break;
+    }
+
+    std::string attr_name;
+    std::string attr_type;
+    std::vector<unsigned char> data;
+    size_t marker_size;
+    if (!tinyexr::ReadAttribute(&attr_name, &attr_type, &data, &marker_size,
+                                marker, size)) {
+      std::stringstream ss;
+      ss << "Failed to parse attribute\n";
+      tinyexr::SetErrorMessage(ss.str(), err);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    marker += marker_size;
+    size -= marker_size;
+
+    if (attr_name.compare("compression") == 0) {
+      compression_type = data[0];
+      if (compression_type > TINYEXR_COMPRESSIONTYPE_PIZ) {
+        std::stringstream ss;
+        ss << "Unsupported compression type : " << compression_type;
+        tinyexr::SetErrorMessage(ss.str(), err);
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+      }
+
+      if (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+        num_scanline_blocks = 16;
+      }
+
+    } else if (attr_name.compare("channels") == 0) {
+      // name: zero-terminated string, from 1 to 255 bytes long
+      // pixel type: int, possible values are: UINT = 0 HALF = 1 FLOAT = 2
+      // pLinear: unsigned char, possible values are 0 and 1
+      // reserved: three chars, should be zero
+      // xSampling: int
+      // ySampling: int
+
+      if (!tinyexr::ReadChannelInfo(channels, data)) {
+        tinyexr::SetErrorMessage("Failed to parse channel info", err);
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      num_channels = static_cast<int>(channels.size());
+
+      if (num_channels < 1) {
+        tinyexr::SetErrorMessage("Invalid channels format", err);
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+    } else if (attr_name.compare("dataWindow") == 0) {
+      memcpy(&dx, &data.at(0), sizeof(int));
+      memcpy(&dy, &data.at(4), sizeof(int));
+      memcpy(&dw, &data.at(8), sizeof(int));
+      memcpy(&dh, &data.at(12), sizeof(int));
+      tinyexr::swap4(&dx);
+      tinyexr::swap4(&dy);
+      tinyexr::swap4(&dw);
+      tinyexr::swap4(&dh);
+
+    } else if (attr_name.compare("displayWindow") == 0) {
+      int x;
+      int y;
+      int w;
+      int h;
+      memcpy(&x, &data.at(0), sizeof(int));
+      memcpy(&y, &data.at(4), sizeof(int));
+      memcpy(&w, &data.at(8), sizeof(int));
+      memcpy(&h, &data.at(12), sizeof(int));
+      tinyexr::swap4(&x);
+      tinyexr::swap4(&y);
+      tinyexr::swap4(&w);
+      tinyexr::swap4(&h);
+    }
+  }
+
+  TINYEXR_CHECK_AND_RETURN_C(dx >= 0, TINYEXR_ERROR_INVALID_DATA);
+  TINYEXR_CHECK_AND_RETURN_C(dy >= 0, TINYEXR_ERROR_INVALID_DATA);
+  TINYEXR_CHECK_AND_RETURN_C(dw >= 0, TINYEXR_ERROR_INVALID_DATA);
+  TINYEXR_CHECK_AND_RETURN_C(dh >= 0, TINYEXR_ERROR_INVALID_DATA);
+  TINYEXR_CHECK_AND_RETURN_C(num_channels >= 1, TINYEXR_ERROR_INVALID_DATA);
+
+  int data_width = dw - dx + 1;
+  int data_height = dh - dy + 1;
+
+  // Read offset tables.
+  int num_blocks = data_height / num_scanline_blocks;
+  if (num_blocks * num_scanline_blocks < data_height) {
+    num_blocks++;
+  }
+
+  std::vector<tinyexr::tinyexr_int64> offsets(static_cast<size_t>(num_blocks));
+
+  for (size_t y = 0; y < static_cast<size_t>(num_blocks); y++) {
+    tinyexr::tinyexr_int64 offset;
+    memcpy(&offset, marker, sizeof(tinyexr::tinyexr_int64));
+    tinyexr::swap8(reinterpret_cast<tinyexr::tinyexr_uint64 *>(&offset));
+    marker += sizeof(tinyexr::tinyexr_int64);  // = 8
+    offsets[y] = offset;
+  }
+
+#if TINYEXR_USE_PIZ
+  if ((compression_type == TINYEXR_COMPRESSIONTYPE_NONE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_PIZ)) {
+#else
+  if ((compression_type == TINYEXR_COMPRESSIONTYPE_NONE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP)) {
+#endif
+    // OK
+  } else {
+    tinyexr::SetErrorMessage("Unsupported compression format", err);
+    return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+  }
+
+  deep_image->image = static_cast<float ***>(
+      malloc(sizeof(float **) * static_cast<size_t>(num_channels)));
+  for (int c = 0; c < num_channels; c++) {
+    deep_image->image[c] = static_cast<float **>(
+        malloc(sizeof(float *) * static_cast<size_t>(data_height)));
+    for (int y = 0; y < data_height; y++) {
+    }
+  }
+
+  deep_image->offset_table = static_cast<int **>(
+      malloc(sizeof(int *) * static_cast<size_t>(data_height)));
+  for (int y = 0; y < data_height; y++) {
+    deep_image->offset_table[y] = static_cast<int *>(
+        malloc(sizeof(int) * static_cast<size_t>(data_width)));
+  }
+
+  for (size_t y = 0; y < static_cast<size_t>(num_blocks); y++) {
+    const unsigned char *data_ptr =
+        reinterpret_cast<const unsigned char *>(head + offsets[y]);
+
+    // int: y coordinate
+    // int64: packed size of pixel offset table
+    // int64: packed size of sample data
+    // int64: unpacked size of sample data
+    // compressed pixel offset table
+    // compressed sample data
+    int line_no;
+    tinyexr::tinyexr_int64 packedOffsetTableSize;
+    tinyexr::tinyexr_int64 packedSampleDataSize;
+    tinyexr::tinyexr_int64 unpackedSampleDataSize;
+    memcpy(&line_no, data_ptr, sizeof(int));
+    memcpy(&packedOffsetTableSize, data_ptr + 4,
+           sizeof(tinyexr::tinyexr_int64));
+    memcpy(&packedSampleDataSize, data_ptr + 12,
+           sizeof(tinyexr::tinyexr_int64));
+    memcpy(&unpackedSampleDataSize, data_ptr + 20,
+           sizeof(tinyexr::tinyexr_int64));
+
+    tinyexr::swap4(&line_no);
+    tinyexr::swap8(
+        reinterpret_cast<tinyexr::tinyexr_uint64 *>(&packedOffsetTableSize));
+    tinyexr::swap8(
+        reinterpret_cast<tinyexr::tinyexr_uint64 *>(&packedSampleDataSize));
+    tinyexr::swap8(
+        reinterpret_cast<tinyexr::tinyexr_uint64 *>(&unpackedSampleDataSize));
+
+    std::vector<int> pixelOffsetTable(static_cast<size_t>(data_width));
+
+    // decode pixel offset table.
+    {
+      unsigned long dstLen =
+          static_cast<unsigned long>(pixelOffsetTable.size() * sizeof(int));
+      if (!tinyexr::DecompressZip(
+              reinterpret_cast<unsigned char *>(&pixelOffsetTable.at(0)),
+              &dstLen, data_ptr + 28,
+              static_cast<unsigned long>(packedOffsetTableSize))) {
+        return false;
+      }
+
+      TINYEXR_CHECK_AND_RETURN_C(dstLen == pixelOffsetTable.size() * sizeof(int), TINYEXR_ERROR_INVALID_DATA);
+      for (size_t i = 0; i < static_cast<size_t>(data_width); i++) {
+        deep_image->offset_table[y][i] = pixelOffsetTable[i];
+      }
+    }
+
+    std::vector<unsigned char> sample_data(
+        static_cast<size_t>(unpackedSampleDataSize));
+
+    // decode sample data.
+    {
+      unsigned long dstLen = static_cast<unsigned long>(unpackedSampleDataSize);
+      if (dstLen) {
+        if (!tinyexr::DecompressZip(
+                reinterpret_cast<unsigned char *>(&sample_data.at(0)), &dstLen,
+                data_ptr + 28 + packedOffsetTableSize,
+                static_cast<unsigned long>(packedSampleDataSize))) {
+          return false;
+        }
+        TINYEXR_CHECK_AND_RETURN_C(dstLen == static_cast<unsigned long>(unpackedSampleDataSize), TINYEXR_ERROR_INVALID_DATA);
+      }
+    }
+
+    // decode sample
+    int sampleSize = -1;
+    std::vector<int> channel_offset_list(static_cast<size_t>(num_channels));
+    {
+      int channel_offset = 0;
+      for (size_t i = 0; i < static_cast<size_t>(num_channels); i++) {
+        channel_offset_list[i] = channel_offset;
+        if (channels[i].pixel_type == TINYEXR_PIXELTYPE_UINT) {  // UINT
+          channel_offset += 4;
+        } else if (channels[i].pixel_type == TINYEXR_PIXELTYPE_HALF) {  // half
+          channel_offset += 2;
+        } else if (channels[i].pixel_type ==
+                   TINYEXR_PIXELTYPE_FLOAT) {  // float
+          channel_offset += 4;
+        } else {
+          tinyexr::SetErrorMessage("Invalid pixel_type in chnnels.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+      }
+      sampleSize = channel_offset;
+    }
+    TINYEXR_CHECK_AND_RETURN_C(sampleSize >= 2, TINYEXR_ERROR_INVALID_DATA);
+
+    TINYEXR_CHECK_AND_RETURN_C(static_cast<size_t>(
+               pixelOffsetTable[static_cast<size_t>(data_width - 1)] *
+               sampleSize) == sample_data.size(), TINYEXR_ERROR_INVALID_DATA);
+    int samples_per_line = static_cast<int>(sample_data.size()) / sampleSize;
+
+    //
+    // Alloc memory
+    //
+
+    //
+    // pixel data is stored as image[channels][pixel_samples]
+    //
+    {
+      tinyexr::tinyexr_uint64 data_offset = 0;
+      for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+        deep_image->image[c][y] = static_cast<float *>(
+            malloc(sizeof(float) * static_cast<size_t>(samples_per_line)));
+
+        if (channels[c].pixel_type == 0) {  // UINT
+          for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
+            unsigned int ui;
+            unsigned int *src_ptr = reinterpret_cast<unsigned int *>(
+                &sample_data.at(size_t(data_offset) + x * sizeof(int)));
+            tinyexr::cpy4(&ui, src_ptr);
+            deep_image->image[c][y][x] = static_cast<float>(ui);  // @fixme
+          }
+          data_offset +=
+              sizeof(unsigned int) * static_cast<size_t>(samples_per_line);
+        } else if (channels[c].pixel_type == 1) {  // half
+          for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
+            tinyexr::FP16 f16;
+            const unsigned short *src_ptr = reinterpret_cast<unsigned short *>(
+                &sample_data.at(size_t(data_offset) + x * sizeof(short)));
+            tinyexr::cpy2(&(f16.u), src_ptr);
+            tinyexr::FP32 f32 = half_to_float(f16);
+            deep_image->image[c][y][x] = f32.f;
+          }
+          data_offset += sizeof(short) * static_cast<size_t>(samples_per_line);
+        } else {  // float
+          for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
+            float f;
+            const float *src_ptr = reinterpret_cast<float *>(
+                &sample_data.at(size_t(data_offset) + x * sizeof(float)));
+            tinyexr::cpy4(&f, src_ptr);
+            deep_image->image[c][y][x] = f;
+          }
+          data_offset += sizeof(float) * static_cast<size_t>(samples_per_line);
+        }
+      }
+    }
+  }  // y
+
+  deep_image->width = data_width;
+  deep_image->height = data_height;
+
+  deep_image->channel_names = static_cast<const char **>(
+      malloc(sizeof(const char *) * static_cast<size_t>(num_channels)));
+  for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+#ifdef _WIN32
+    deep_image->channel_names[c] = _strdup(channels[c].name.c_str());
+#else
+    deep_image->channel_names[c] = strdup(channels[c].name.c_str());
+#endif
+  }
+  deep_image->num_channels = num_channels;
+
+  return TINYEXR_SUCCESS;
+}
+
+void InitEXRImage(EXRImage *exr_image) {
+  if (exr_image == NULL) {
+    return;
+  }
+
+  exr_image->width = 0;
+  exr_image->height = 0;
+  exr_image->num_channels = 0;
+
+  exr_image->images = NULL;
+  exr_image->tiles = NULL;
+  exr_image->next_level = NULL;
+  exr_image->level_x = 0;
+  exr_image->level_y = 0;
+
+  exr_image->num_tiles = 0;
+}
+
+void FreeEXRErrorMessage(const char *msg) {
+  if (msg) {
+    free(reinterpret_cast<void *>(const_cast<char *>(msg)));
+  }
+  return;
+}
+
+void InitEXRHeader(EXRHeader *exr_header) {
+  if (exr_header == NULL) {
+    return;
+  }
+
+  memset(exr_header, 0, sizeof(EXRHeader));
+}
+
+int FreeEXRHeader(EXRHeader *exr_header) {
+  if (exr_header == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (exr_header->channels) {
+    free(exr_header->channels);
+  }
+
+  if (exr_header->pixel_types) {
+    free(exr_header->pixel_types);
+  }
+
+  if (exr_header->requested_pixel_types) {
+    free(exr_header->requested_pixel_types);
+  }
+
+  for (int i = 0; i < exr_header->num_custom_attributes; i++) {
+    if (exr_header->custom_attributes[i].value) {
+      free(exr_header->custom_attributes[i].value);
+    }
+  }
+
+  if (exr_header->custom_attributes) {
+    free(exr_header->custom_attributes);
+  }
+
+  EXRSetNameAttr(exr_header, NULL);
+
+  return TINYEXR_SUCCESS;
+}
+
+void EXRSetNameAttr(EXRHeader* exr_header, const char* name) {
+  if (exr_header == NULL) {
+    return;
+  }
+  memset(exr_header->name, 0, 256);
+  if (name != NULL) {
+    size_t len = std::min(strlen(name), size_t(255));
+    if (len) {
+      memcpy(exr_header->name, name, len);
+    }
+  }
+}
+
+int EXRNumLevels(const EXRImage* exr_image) {
+  if (exr_image == NULL) return 0;
+  if(exr_image->images) return 1; // scanlines
+  int levels = 1;
+  const EXRImage* level_image = exr_image;
+
+#if 0
+  while ((level_image = level_image->next_level)) 
+      ++levels;
+#else
+  for (; ;)
+  {
+      level_image = level_image->next_level;
+      if (!level_image)
+          break;
+      ++levels;
+  }
+#endif
+
+  return levels;
+}
+
+int FreeEXRImage(EXRImage *exr_image) {
+  if (exr_image == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (exr_image->next_level) {
+    FreeEXRImage(exr_image->next_level);
+    delete exr_image->next_level;
+  }
+
+  for (int i = 0; i < exr_image->num_channels; i++) {
+    if (exr_image->images && exr_image->images[i]) {
+      free(exr_image->images[i]);
+    }
+  }
+
+  if (exr_image->images) {
+    free(exr_image->images);
+  }
+
+  if (exr_image->tiles) {
+    for (int tid = 0; tid < exr_image->num_tiles; tid++) {
+      for (int i = 0; i < exr_image->num_channels; i++) {
+        if (exr_image->tiles[tid].images && exr_image->tiles[tid].images[i]) {
+          free(exr_image->tiles[tid].images[i]);
+        }
+      }
+      if (exr_image->tiles[tid].images) {
+        free(exr_image->tiles[tid].images);
+      }
+    }
+    free(exr_image->tiles);
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+int ParseEXRHeaderFromFile(EXRHeader *exr_header, const EXRVersion *exr_version,
+                           const char *filename, const char **err) {
+  if (exr_header == NULL || exr_version == NULL || filename == NULL) {
+    tinyexr::SetErrorMessage("Invalid argument for ParseEXRHeaderFromFile",
+                             err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  MemoryMappedFile file(filename);
+  if (!file.valid()) {
+    tinyexr::SetErrorMessage("Cannot read file " + std::string(filename), err);
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  return ParseEXRHeaderFromMemory(exr_header, exr_version, file.data, file.size,
+                                  err);
+}
+
+int ParseEXRMultipartHeaderFromMemory(EXRHeader ***exr_headers,
+                                      int *num_headers,
+                                      const EXRVersion *exr_version,
+                                      const unsigned char *memory, size_t size,
+                                      const char **err) {
+  if (memory == NULL || exr_headers == NULL || num_headers == NULL ||
+      exr_version == NULL) {
+    // Invalid argument
+    tinyexr::SetErrorMessage(
+        "Invalid argument for ParseEXRMultipartHeaderFromMemory", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (size < tinyexr::kEXRVersionSize) {
+    tinyexr::SetErrorMessage("Data size too short", err);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  const unsigned char *marker = memory + tinyexr::kEXRVersionSize;
+  size_t marker_size = size - tinyexr::kEXRVersionSize;
+
+  std::vector<tinyexr::HeaderInfo> infos;
+
+  for (;;) {
+    tinyexr::HeaderInfo info;
+    info.clear();
+
+    std::string err_str;
+    bool empty_header = false;
+    int ret = ParseEXRHeader(&info, &empty_header, exr_version, &err_str,
+                             marker, marker_size);
+
+    if (ret != TINYEXR_SUCCESS) {
+
+      // Free malloc-allocated memory here.
+      for (size_t i = 0; i < info.attributes.size(); i++) {
+        if (info.attributes[i].value) {
+          free(info.attributes[i].value);
+        }
+      }
+
+      tinyexr::SetErrorMessage(err_str, err);
+      return ret;
+    }
+
+    if (empty_header) {
+      marker += 1;  // skip '\0'
+      break;
+    }
+
+    // `chunkCount` must exist in the header.
+    if (info.chunk_count == 0) {
+
+      // Free malloc-allocated memory here.
+      for (size_t i = 0; i < info.attributes.size(); i++) {
+        if (info.attributes[i].value) {
+          free(info.attributes[i].value);
+        }
+      }
+
+      tinyexr::SetErrorMessage(
+          "`chunkCount' attribute is not found in the header.", err);
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    infos.push_back(info);
+
+    // move to next header.
+    marker += info.header_len;
+    size -= info.header_len;
+  }
+
+  // allocate memory for EXRHeader and create array of EXRHeader pointers.
+  (*exr_headers) =
+      static_cast<EXRHeader **>(malloc(sizeof(EXRHeader *) * infos.size()));
+
+
+  int retcode = TINYEXR_SUCCESS;
+
+  for (size_t i = 0; i < infos.size(); i++) {
+    EXRHeader *exr_header = static_cast<EXRHeader *>(malloc(sizeof(EXRHeader)));
+    memset(exr_header, 0, sizeof(EXRHeader));
+
+    std::string warn;
+    std::string _err;
+    if (!ConvertHeader(exr_header, infos[i], &warn, &_err)) {
+
+      // Free malloc-allocated memory here.
+      for (size_t k = 0; k < infos[i].attributes.size(); k++) {
+        if (infos[i].attributes[k].value) {
+          free(infos[i].attributes[k].value);
+        }
+      }
+
+      if (!_err.empty()) {
+        tinyexr::SetErrorMessage(
+            _err, err);
+      }
+      // continue to converting headers
+      retcode = TINYEXR_ERROR_INVALID_HEADER;
+    }
+
+    exr_header->multipart = exr_version->multipart ? 1 : 0;
+
+    (*exr_headers)[i] = exr_header;
+  }
+
+  (*num_headers) = static_cast<int>(infos.size());
+
+  return retcode;
+}
+
+int ParseEXRMultipartHeaderFromFile(EXRHeader ***exr_headers, int *num_headers,
+                                    const EXRVersion *exr_version,
+                                    const char *filename, const char **err) {
+  if (exr_headers == NULL || num_headers == NULL || exr_version == NULL ||
+      filename == NULL) {
+    tinyexr::SetErrorMessage(
+        "Invalid argument for ParseEXRMultipartHeaderFromFile()", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  MemoryMappedFile file(filename);
+  if (!file.valid()) {
+    tinyexr::SetErrorMessage("Cannot read file " + std::string(filename), err);
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  return ParseEXRMultipartHeaderFromMemory(
+      exr_headers, num_headers, exr_version, file.data, file.size, err);
+}
+
+int ParseEXRVersionFromMemory(EXRVersion *version, const unsigned char *memory,
+                              size_t size) {
+  if (version == NULL || memory == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (size < tinyexr::kEXRVersionSize) {
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  const unsigned char *marker = memory;
+
+  // Header check.
+  {
+    const char header[] = {0x76, 0x2f, 0x31, 0x01};
+
+    if (memcmp(marker, header, 4) != 0) {
+      return TINYEXR_ERROR_INVALID_MAGIC_NUMBER;
+    }
+    marker += 4;
+  }
+
+  version->tiled = false;
+  version->long_name = false;
+  version->non_image = false;
+  version->multipart = false;
+
+  // Parse version header.
+  {
+    // must be 2
+    if (marker[0] != 2) {
+      return TINYEXR_ERROR_INVALID_EXR_VERSION;
+    }
+
+    if (version == NULL) {
+      return TINYEXR_SUCCESS;  // May OK
+    }
+
+    version->version = 2;
+
+    if (marker[1] & 0x2) {  // 9th bit
+      version->tiled = true;
+    }
+    if (marker[1] & 0x4) {  // 10th bit
+      version->long_name = true;
+    }
+    if (marker[1] & 0x8) {        // 11th bit
+      version->non_image = true;  // (deep image)
+    }
+    if (marker[1] & 0x10) {  // 12th bit
+      version->multipart = true;
+    }
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+int ParseEXRVersionFromFile(EXRVersion *version, const char *filename) {
+  if (filename == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = NULL;
+#ifdef _WIN32
+#if defined(_MSC_VER) || (defined(MINGW_HAS_SECURE_API) && MINGW_HAS_SECURE_API) // MSVC, MinGW GCC, or Clang.
+  errno_t err = _wfopen_s(&fp, tinyexr::UTF8ToWchar(filename).c_str(), L"rb");
+  if (err != 0) {
+    // TODO(syoyo): return wfopen_s erro code
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+#else
+  // Unknown compiler or MinGW without MINGW_HAS_SECURE_API.
+  fp = fopen(filename, "rb");
+#endif
+#else
+  fp = fopen(filename, "rb");
+#endif
+  if (!fp) {
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  // Try to read kEXRVersionSize bytes; if the file is shorter than
+  // kEXRVersionSize, this will produce an error. This avoids a call to
+  // fseek(fp, 0, SEEK_END), which is not required to be supported by C
+  // implementations.
+  unsigned char buf[tinyexr::kEXRVersionSize];
+  size_t ret = fread(&buf[0], 1, tinyexr::kEXRVersionSize, fp);
+  fclose(fp);
+
+  if (ret != tinyexr::kEXRVersionSize) {
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  return ParseEXRVersionFromMemory(version, buf, tinyexr::kEXRVersionSize);
+}
+
+int LoadEXRMultipartImageFromMemory(EXRImage *exr_images,
+                                    const EXRHeader **exr_headers,
+                                    unsigned int num_parts,
+                                    const unsigned char *memory,
+                                    const size_t size, const char **err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts == 0 ||
+      memory == NULL || (size <= tinyexr::kEXRVersionSize)) {
+    tinyexr::SetErrorMessage(
+        "Invalid argument for LoadEXRMultipartImageFromMemory()", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  // compute total header size.
+  size_t total_header_size = 0;
+  for (unsigned int i = 0; i < num_parts; i++) {
+    if (exr_headers[i]->header_len == 0) {
+      tinyexr::SetErrorMessage("EXRHeader variable is not initialized.", err);
+      return TINYEXR_ERROR_INVALID_ARGUMENT;
+    }
+
+    total_header_size += exr_headers[i]->header_len;
+  }
+
+  const char *marker = reinterpret_cast<const char *>(
+      memory + total_header_size + 4 +
+      4);  // +8 for magic number and version header.
+
+  marker += 1;  // Skip empty header.
+
+  // NOTE 1:
+  //   In multipart image, There is 'part number' before chunk data.
+  //   4 byte : part number
+  //   4+     : chunk
+  //
+  // NOTE 2:
+  //   EXR spec says 'part number' is 'unsigned long' but actually this is
+  //   'unsigned int(4 bytes)' in OpenEXR implementation...
+  //   http://www.openexr.com/openexrfilelayout.pdf
+
+  // Load chunk offset table.
+  std::vector<tinyexr::OffsetData> chunk_offset_table_list;
+  chunk_offset_table_list.reserve(num_parts);
+  for (size_t i = 0; i < static_cast<size_t>(num_parts); i++) {
+    chunk_offset_table_list.resize(chunk_offset_table_list.size() + 1);
+    tinyexr::OffsetData& offset_data = chunk_offset_table_list.back();
+    if (!exr_headers[i]->tiled || exr_headers[i]->tile_level_mode == TINYEXR_TILE_ONE_LEVEL) {
+      tinyexr::InitSingleResolutionOffsets(offset_data, size_t(exr_headers[i]->chunk_count));
+      std::vector<tinyexr::tinyexr_uint64>& offset_table = offset_data.offsets[0][0];
+
+      for (size_t c = 0; c < offset_table.size(); c++) {
+        tinyexr::tinyexr_uint64 offset;
+        memcpy(&offset, marker, 8);
+        tinyexr::swap8(&offset);
+
+        if (offset >= size) {
+          tinyexr::SetErrorMessage("Invalid offset size in EXR header chunks.",
+                                   err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+
+        offset_table[c] = offset + 4;  // +4 to skip 'part number'
+        marker += 8;
+      }
+    } else {
+      {
+        std::vector<int> num_x_tiles, num_y_tiles;
+        if (!tinyexr::PrecalculateTileInfo(num_x_tiles, num_y_tiles, exr_headers[i])) {
+          tinyexr::SetErrorMessage("Invalid tile info.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+        int num_blocks = InitTileOffsets(offset_data, exr_headers[i], num_x_tiles, num_y_tiles);
+        if (num_blocks != exr_headers[i]->chunk_count) {
+          tinyexr::SetErrorMessage("Invalid offset table size.", err);
+          return TINYEXR_ERROR_INVALID_DATA;
+        }
+      }
+      for (unsigned int l = 0; l < offset_data.offsets.size(); ++l) {
+        for (unsigned int dy = 0; dy < offset_data.offsets[l].size(); ++dy) {
+          for (unsigned int dx = 0; dx < offset_data.offsets[l][dy].size(); ++dx) {
+            tinyexr::tinyexr_uint64 offset;
+            memcpy(&offset, marker, sizeof(tinyexr::tinyexr_uint64));
+            tinyexr::swap8(&offset);
+            if (offset >= size) {
+              tinyexr::SetErrorMessage("Invalid offset size in EXR header chunks.",
+                err);
+              return TINYEXR_ERROR_INVALID_DATA;
+            }
+            offset_data.offsets[l][dy][dx] = offset + 4; // +4 to skip 'part number'
+            marker += sizeof(tinyexr::tinyexr_uint64);  // = 8
+          }
+        }
+      }
+    }
+  }
+
+  // Decode image.
+  for (size_t i = 0; i < static_cast<size_t>(num_parts); i++) {
+    tinyexr::OffsetData &offset_data = chunk_offset_table_list[i];
+
+    // First check 'part number' is identical to 'i'
+    for (unsigned int l = 0; l < offset_data.offsets.size(); ++l)
+      for (unsigned int dy = 0; dy < offset_data.offsets[l].size(); ++dy)
+        for (unsigned int dx = 0; dx < offset_data.offsets[l][dy].size(); ++dx) {
+
+          const unsigned char *part_number_addr =
+              memory + offset_data.offsets[l][dy][dx] - 4;  // -4 to move to 'part number' field.
+          unsigned int part_no;
+          memcpy(&part_no, part_number_addr, sizeof(unsigned int));  // 4
+          tinyexr::swap4(&part_no);
+
+          if (part_no != i) {
+            tinyexr::SetErrorMessage("Invalid `part number' in EXR header chunks.",
+                                     err);
+            return TINYEXR_ERROR_INVALID_DATA;
+          }
+        }
+
+    std::string e;
+    int ret = tinyexr::DecodeChunk(&exr_images[i], exr_headers[i], offset_data,
+                                   memory, size, &e);
+    if (ret != TINYEXR_SUCCESS) {
+      if (!e.empty()) {
+        tinyexr::SetErrorMessage(e, err);
+      }
+      return ret;
+    }
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+int LoadEXRMultipartImageFromFile(EXRImage *exr_images,
+                                  const EXRHeader **exr_headers,
+                                  unsigned int num_parts, const char *filename,
+                                  const char **err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts == 0) {
+    tinyexr::SetErrorMessage(
+        "Invalid argument for LoadEXRMultipartImageFromFile", err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  MemoryMappedFile file(filename);
+  if (!file.valid()) {
+    tinyexr::SetErrorMessage("Cannot read file " + std::string(filename), err);
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  return LoadEXRMultipartImageFromMemory(exr_images, exr_headers, num_parts,
+                                         file.data, file.size, err);
+}
+
+int SaveEXRToMemory(const float *data, int width, int height, int components,
+            const int save_as_fp16, const unsigned char **outbuf, const char **err) {
+
+  if ((components == 1) || components == 3 || components == 4) {
+    // OK
+  } else {
+    std::stringstream ss;
+    ss << "Unsupported component value : " << components << std::endl;
+
+    tinyexr::SetErrorMessage(ss.str(), err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  EXRHeader header;
+  InitEXRHeader(&header);
+
+  if ((width < 16) && (height < 16)) {
+    // No compression for small image.
+    header.compression_type = TINYEXR_COMPRESSIONTYPE_NONE;
+  } else {
+    header.compression_type = TINYEXR_COMPRESSIONTYPE_ZIP;
+  }
+
+  EXRImage image;
+  InitEXRImage(&image);
+
+  image.num_channels = components;
+
+  std::vector<float> images[4];
+
+  if (components == 1) {
+    images[0].resize(static_cast<size_t>(width * height));
+    memcpy(images[0].data(), data, sizeof(float) * size_t(width * height));
+  } else {
+    images[0].resize(static_cast<size_t>(width * height));
+    images[1].resize(static_cast<size_t>(width * height));
+    images[2].resize(static_cast<size_t>(width * height));
+    images[3].resize(static_cast<size_t>(width * height));
+
+    // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
+    for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
+      images[0][i] = data[static_cast<size_t>(components) * i + 0];
+      images[1][i] = data[static_cast<size_t>(components) * i + 1];
+      images[2][i] = data[static_cast<size_t>(components) * i + 2];
+      if (components == 4) {
+        images[3][i] = data[static_cast<size_t>(components) * i + 3];
+      }
+    }
+  }
+
+  float *image_ptr[4] = {0, 0, 0, 0};
+  if (components == 4) {
+    image_ptr[0] = &(images[3].at(0));  // A
+    image_ptr[1] = &(images[2].at(0));  // B
+    image_ptr[2] = &(images[1].at(0));  // G
+    image_ptr[3] = &(images[0].at(0));  // R
+  } else if (components == 3) {
+    image_ptr[0] = &(images[2].at(0));  // B
+    image_ptr[1] = &(images[1].at(0));  // G
+    image_ptr[2] = &(images[0].at(0));  // R
+  } else if (components == 1) {
+    image_ptr[0] = &(images[0].at(0));  // A
+  }
+
+  image.images = reinterpret_cast<unsigned char **>(image_ptr);
+  image.width = width;
+  image.height = height;
+
+  header.num_channels = components;
+  header.channels = static_cast<EXRChannelInfo *>(malloc(
+      sizeof(EXRChannelInfo) * static_cast<size_t>(header.num_channels)));
+  // Must be (A)BGR order, since most of EXR viewers expect this channel order.
+  if (components == 4) {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "A", 255);
+    strncpy_s(header.channels[1].name, "B", 255);
+    strncpy_s(header.channels[2].name, "G", 255);
+    strncpy_s(header.channels[3].name, "R", 255);
+#else
+    strncpy(header.channels[0].name, "A", 255);
+    strncpy(header.channels[1].name, "B", 255);
+    strncpy(header.channels[2].name, "G", 255);
+    strncpy(header.channels[3].name, "R", 255);
+#endif
+    header.channels[0].name[strlen("A")] = '\0';
+    header.channels[1].name[strlen("B")] = '\0';
+    header.channels[2].name[strlen("G")] = '\0';
+    header.channels[3].name[strlen("R")] = '\0';
+  } else if (components == 3) {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "B", 255);
+    strncpy_s(header.channels[1].name, "G", 255);
+    strncpy_s(header.channels[2].name, "R", 255);
+#else
+    strncpy(header.channels[0].name, "B", 255);
+    strncpy(header.channels[1].name, "G", 255);
+    strncpy(header.channels[2].name, "R", 255);
+#endif
+    header.channels[0].name[strlen("B")] = '\0';
+    header.channels[1].name[strlen("G")] = '\0';
+    header.channels[2].name[strlen("R")] = '\0';
+  } else {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "A", 255);
+#else
+    strncpy(header.channels[0].name, "A", 255);
+#endif
+    header.channels[0].name[strlen("A")] = '\0';
+  }
+
+  header.pixel_types = static_cast<int *>(
+      malloc(sizeof(int) * static_cast<size_t>(header.num_channels)));
+  header.requested_pixel_types = static_cast<int *>(
+      malloc(sizeof(int) * static_cast<size_t>(header.num_channels)));
+  for (int i = 0; i < header.num_channels; i++) {
+    header.pixel_types[i] =
+        TINYEXR_PIXELTYPE_FLOAT;  // pixel type of input image
+
+    if (save_as_fp16 > 0) {
+      header.requested_pixel_types[i] =
+          TINYEXR_PIXELTYPE_HALF;  // save with half(fp16) pixel format
+    } else {
+      header.requested_pixel_types[i] =
+          TINYEXR_PIXELTYPE_FLOAT;  // save with float(fp32) pixel format(i.e.
+                                    // no precision reduction)
+    }
+  }
+
+
+  unsigned char *mem_buf;
+  size_t mem_size = SaveEXRImageToMemory(&image, &header, &mem_buf, err);
+
+  if (mem_size == 0) {
+    return TINYEXR_ERROR_SERIALIZATION_FAILED;
+  }
+
+  free(header.channels);
+  free(header.pixel_types);
+  free(header.requested_pixel_types);
+
+  if (mem_size > size_t(std::numeric_limits<int>::max())) {
+    free(mem_buf);
+    return TINYEXR_ERROR_DATA_TOO_LARGE;
+  }
+
+  (*outbuf) = mem_buf;
+
+  return int(mem_size);
+}
+
+int SaveEXR(const float *data, int width, int height, int components,
+            const int save_as_fp16, const char *outfilename, const char **err) {
+  if ((components == 1) || components == 3 || components == 4) {
+    // OK
+  } else {
+    std::stringstream ss;
+    ss << "Unsupported component value : " << components << std::endl;
+
+    tinyexr::SetErrorMessage(ss.str(), err);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  EXRHeader header;
+  InitEXRHeader(&header);
+
+  if ((width < 16) && (height < 16)) {
+    // No compression for small image.
+    header.compression_type = TINYEXR_COMPRESSIONTYPE_NONE;
+  } else {
+    header.compression_type = TINYEXR_COMPRESSIONTYPE_ZIP;
+  }
+
+  EXRImage image;
+  InitEXRImage(&image);
+
+  image.num_channels = components;
+
+  std::vector<float> images[4];
+  const size_t pixel_count =
+      static_cast<size_t>(width) * static_cast<size_t>(height);
+
+  if (components == 1) {
+    images[0].resize(pixel_count);
+    memcpy(images[0].data(), data, sizeof(float) * pixel_count);
+  } else {
+    images[0].resize(pixel_count);
+    images[1].resize(pixel_count);
+    images[2].resize(pixel_count);
+    images[3].resize(pixel_count);
+
+    // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
+    for (size_t i = 0; i < pixel_count; i++) {
+      images[0][i] = data[static_cast<size_t>(components) * i + 0];
+      images[1][i] = data[static_cast<size_t>(components) * i + 1];
+      images[2][i] = data[static_cast<size_t>(components) * i + 2];
+      if (components == 4) {
+        images[3][i] = data[static_cast<size_t>(components) * i + 3];
+      }
+    }
+  }
+
+  float *image_ptr[4] = {0, 0, 0, 0};
+  if (components == 4) {
+    image_ptr[0] = &(images[3].at(0));  // A
+    image_ptr[1] = &(images[2].at(0));  // B
+    image_ptr[2] = &(images[1].at(0));  // G
+    image_ptr[3] = &(images[0].at(0));  // R
+  } else if (components == 3) {
+    image_ptr[0] = &(images[2].at(0));  // B
+    image_ptr[1] = &(images[1].at(0));  // G
+    image_ptr[2] = &(images[0].at(0));  // R
+  } else if (components == 1) {
+    image_ptr[0] = &(images[0].at(0));  // A
+  }
+
+  image.images = reinterpret_cast<unsigned char **>(image_ptr);
+  image.width = width;
+  image.height = height;
+
+  header.num_channels = components;
+  header.channels = static_cast<EXRChannelInfo *>(malloc(
+      sizeof(EXRChannelInfo) * static_cast<size_t>(header.num_channels)));
+  // Must be (A)BGR order, since most of EXR viewers expect this channel order.
+  if (components == 4) {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "A", 255);
+    strncpy_s(header.channels[1].name, "B", 255);
+    strncpy_s(header.channels[2].name, "G", 255);
+    strncpy_s(header.channels[3].name, "R", 255);
+#else
+    strncpy(header.channels[0].name, "A", 255);
+    strncpy(header.channels[1].name, "B", 255);
+    strncpy(header.channels[2].name, "G", 255);
+    strncpy(header.channels[3].name, "R", 255);
+#endif
+    header.channels[0].name[strlen("A")] = '\0';
+    header.channels[1].name[strlen("B")] = '\0';
+    header.channels[2].name[strlen("G")] = '\0';
+    header.channels[3].name[strlen("R")] = '\0';
+  } else if (components == 3) {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "B", 255);
+    strncpy_s(header.channels[1].name, "G", 255);
+    strncpy_s(header.channels[2].name, "R", 255);
+#else
+    strncpy(header.channels[0].name, "B", 255);
+    strncpy(header.channels[1].name, "G", 255);
+    strncpy(header.channels[2].name, "R", 255);
+#endif
+    header.channels[0].name[strlen("B")] = '\0';
+    header.channels[1].name[strlen("G")] = '\0';
+    header.channels[2].name[strlen("R")] = '\0';
+  } else {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "A", 255);
+#else
+    strncpy(header.channels[0].name, "A", 255);
+#endif
+    header.channels[0].name[strlen("A")] = '\0';
+  }
+
+  header.pixel_types = static_cast<int *>(
+      malloc(sizeof(int) * static_cast<size_t>(header.num_channels)));
+  header.requested_pixel_types = static_cast<int *>(
+      malloc(sizeof(int) * static_cast<size_t>(header.num_channels)));
+  for (int i = 0; i < header.num_channels; i++) {
+    header.pixel_types[i] =
+        TINYEXR_PIXELTYPE_FLOAT;  // pixel type of input image
+
+    if (save_as_fp16 > 0) {
+      header.requested_pixel_types[i] =
+          TINYEXR_PIXELTYPE_HALF;  // save with half(fp16) pixel format
+    } else {
+      header.requested_pixel_types[i] =
+          TINYEXR_PIXELTYPE_FLOAT;  // save with float(fp32) pixel format(i.e.
+                                    // no precision reduction)
+    }
+  }
+
+  int ret = SaveEXRImageToFile(&image, &header, outfilename, err);
+  if (ret != TINYEXR_SUCCESS) {
+    return ret;
+  }
+
+  free(header.channels);
+  free(header.pixel_types);
+  free(header.requested_pixel_types);
+
+  return ret;
+}
+
+#ifdef __clang__
+// zero-as-null-pointer-constant
+#pragma clang diagnostic pop
+#endif
+
+#endif  // TINYEXR_IMPLEMENTATION_DEFINED
+#endif  // TINYEXR_IMPLEMENTATION