std/jpeg: avoid AVX2 upsample_inv_h2v2 for GCC
name                                              old speed      new speed      delta
wuffs_jpeg_decode_19k_8bpp/clang11                369MB/s ± 0%   368MB/s ± 0%   ~        (p=1.000 n=5+5)
wuffs_jpeg_decode_30k_24bpp_progressive/clang11   332MB/s ± 0%   333MB/s ± 0%   +0.46%   (p=0.008 n=5+5)
wuffs_jpeg_decode_30k_24bpp_sequential/clang11    1.01GB/s ± 0%  1.01GB/s ± 0%  ~        (p=1.000 n=5+5)
wuffs_jpeg_decode_77k_24bpp/clang11               829MB/s ± 0%   829MB/s ± 0%   ~        (p=0.905 n=4+5)
wuffs_jpeg_decode_552k_24bpp_420/clang11          902MB/s ± 0%   902MB/s ± 0%   ~        (p=0.548 n=5+5)
wuffs_jpeg_decode_552k_24bpp_444/clang11          649MB/s ± 0%   649MB/s ± 0%   ~        (p=0.841 n=5+5)
wuffs_jpeg_decode_4002k_24bpp/clang11             904MB/s ± 0%   905MB/s ± 0%   +0.14%   (p=0.008 n=5+5)
wuffs_jpeg_decode_19k_8bpp/gcc10                  322MB/s ± 0%   362MB/s ± 0%   +12.60%  (p=0.008 n=5+5)
wuffs_jpeg_decode_30k_24bpp_progressive/gcc10     284MB/s ± 0%   328MB/s ± 0%   +15.63%  (p=0.008 n=5+5)
wuffs_jpeg_decode_30k_24bpp_sequential/gcc10      908MB/s ± 0%   926MB/s ± 0%   +2.04%   (p=0.008 n=5+5)
wuffs_jpeg_decode_77k_24bpp/gcc10                 730MB/s ± 0%   757MB/s ± 0%   +3.76%   (p=0.008 n=5+5)
wuffs_jpeg_decode_552k_24bpp_420/gcc10            785MB/s ± 0%   835MB/s ± 0%   +6.40%   (p=0.008 n=5+5)
wuffs_jpeg_decode_552k_24bpp_444/gcc10            625MB/s ± 0%   655MB/s ± 0%   +4.76%   (p=0.016 n=4+5)
wuffs_jpeg_decode_4002k_24bpp/gcc10               783MB/s ± 0%   843MB/s ± 0%   +7.57%   (p=0.016 n=5+4)
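
The guard works at compile time: clang also defines __GNUC__ (for GNU
compatibility), so checking defined(__GNUC__) && !defined(__clang__) is what
singles out GCC. Below is a minimal, self-contained sketch of the same
dispatch pattern; the upsample_portable, upsample_avx2 and choose_upsampler
names are placeholders for illustration, not actual Wuffs identifiers.

    #include <stdio.h>

    typedef void (*upsample_func)(void);

    static void upsample_portable(void) { puts("portable triangle filter"); }
    static void upsample_avx2(void) { puts("hand-written AVX2 path"); }

    // Mirror the patch's dispatch: a runtime AVX2 check, but compiled out
    // entirely when building with GCC (and not clang).
    static upsample_func choose_upsampler(int cpu_has_avx2) {
      upsample_func f = upsample_portable;
    #if defined(__GNUC__) && !defined(__clang__)
      // GCC: keep the portable code; its auto-vectorized output wins here.
      (void)cpu_has_avx2;
    #else
      // clang (and other compilers): take the AVX2 path if the CPU has it.
      if (cpu_has_avx2) {
        f = upsample_avx2;
      }
    #endif
      return f;
    }

    int main(void) {
      choose_upsampler(1)();
      return 0;
    }

In the real patch, the runtime wuffs_base__cpu_arch__have_x86_avx2() check
stays in the #else branch, so clang builds still verify AVX2 support before
taking the SIMD path; GCC builds simply never install the AVX2 function
pointer into upfuncs[1][1] and keep the portable triangle-filter upsampler.
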
diff --git a/internal/cgen/base/pixconv-submodule-ycck.c b/internal/cgen/base/pixconv-submodule-ycck.c
index adf8d24..936bb51 100644
--- a/internal/cgen/base/pixconv-submodule-ycck.c
+++ b/internal/cgen/base/pixconv-submodule-ycck.c
@@ -1049,11 +1049,18 @@
wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h2, inv_v2))) {
func = &wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter;
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__) && !defined(__clang__)
+ // Don't use our AVX2 implementation for GCC (but do use it for clang). For
+ // some unknown reason, GCC performs noticeably better on the non-SIMD
+ // version. Possibly GCC's auto-vectorizer (using just SSE2, not AVX2) is
+ // smarter than our hand-written code, but that's just a guess.
+#else
if (wuffs_base__cpu_arch__have_x86_avx2()) {
upfuncs[1][1] =
wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2;
}
#endif
+#endif
} else {
switch (dst->pixcfg.private_impl.pixfmt.repr) {
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 61e2ab3..ecbef92 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -24871,11 +24871,18 @@
wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h2, inv_v2))) {
func = &wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter;
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__) && !defined(__clang__)
+ // Don't use our AVX2 implementation for GCC (but do use it for clang). For
+ // some unknown reason, GCC performs noticeably better on the non-SIMD
+ // version. Possibly GCC's auto-vectorizer (using just SSE2, not AVX2) is
+ // smarter than our hand-written code, but that's just a guess.
+#else
if (wuffs_base__cpu_arch__have_x86_avx2()) {
upfuncs[1][1] =
wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2;
}
#endif
+#endif
} else {
switch (dst->pixcfg.private_impl.pixfmt.repr) {