New files
diff --git a/encoder/basisu_kernels_declares.h b/encoder/basisu_kernels_declares.h
new file mode 100644
index 0000000..2872604
--- /dev/null
+++ b/encoder/basisu_kernels_declares.h
@@ -0,0 +1,25 @@
+// basisu_kernels_declares.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if BASISU_SUPPORT_SSE
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+#endif
\ No newline at end of file
diff --git a/encoder/basisu_kernels_imp.h b/encoder/basisu_kernels_imp.h
new file mode 100644
index 0000000..0468805
--- /dev/null
+++ b/encoder/basisu_kernels_imp.h
@@ -0,0 +1,584 @@
+// basisu_kernels_imp.h - Do not directly include
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using namespace CPPSPMD;
+
+namespace CPPSPMD_NAME(basisu_kernels_namespace)
+{
+   struct perceptual_distance_rgb_4_N : spmd_kernel
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3))
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+
+            vint delta_l = dr * 27 + dg * 92 + db * 9;
+            vint delta_cr = dr * 128 - delta_l;
+            vint delta_cb = db * 128 - delta_l;
+
+            vint id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int delta_l = dr * 27 + dg * 92 + db * 9;
+            int delta_cr = dr * 128 - delta_l;
+            int delta_cb = db * 128 - delta_l;
+
+            int id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct linear_distance_rgb_4_N : spmd_kernel
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3))
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+
+            vint id = dr * dr + dg * dg + db * db;
+
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int id = dr * dr + dg * dg + db * db;
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_selectors_perceptual_rgb_4_N : spmd_kernel
+   {
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
+
+            __m128i vsels = shuffle_epi8(sels.m_value, shuf);
+            storeu_si32((void *)(pSelectors + i), vsels);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX, best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               if (id < best_err)
+               {
+                  best_err = id;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[i] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_selectors_linear_rgb_4_N : spmd_kernel
+   {
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint id = dr * dr + dg * dg + db * db;
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+
+         *pDistance = 0;
+
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
+
+            __m128i vsels = shuffle_epi8(sels.m_value, shuf);
+            storeu_si32((void *)(pSelectors + i), vsels);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX, best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int id = dr * dr + dg * dg + db * db;
+               if (id < best_err)
+               {
+                  best_err = id;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[i] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel
+   {
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+
+         *pDistance = 0;
+
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+   struct find_lowest_error_linear_rgb_4_N : spmd_kernel
+   {
+      inline vint compute_dist(
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint id = dr * dr + dg * dg + db * db;
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+
+         *pDistance = 0;
+
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error)
+               return;
+         }
+
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int id = dr * dr + dg * dg + db * db;
+
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+} // namespace
+
+using namespace CPPSPMD_NAME(basisu_kernels_namespace);
+
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
+{
+   spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
+{
+   spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+