std/jpeg: avoid AVX2 upsample_inv_h2v2 for GCC

name                                             old speed      new speed      delta

wuffs_jpeg_decode_19k_8bpp/clang11                369MB/s ± 0%   368MB/s ± 0%     ~     (p=1.000 n=5+5)
wuffs_jpeg_decode_30k_24bpp_progressive/clang11   332MB/s ± 0%   333MB/s ± 0%   +0.46%  (p=0.008 n=5+5)
wuffs_jpeg_decode_30k_24bpp_sequential/clang11   1.01GB/s ± 0%  1.01GB/s ± 0%     ~     (p=1.000 n=5+5)
wuffs_jpeg_decode_77k_24bpp/clang11               829MB/s ± 0%   829MB/s ± 0%     ~     (p=0.905 n=4+5)
wuffs_jpeg_decode_552k_24bpp_420/clang11          902MB/s ± 0%   902MB/s ± 0%     ~     (p=0.548 n=5+5)
wuffs_jpeg_decode_552k_24bpp_444/clang11          649MB/s ± 0%   649MB/s ± 0%     ~     (p=0.841 n=5+5)
wuffs_jpeg_decode_4002k_24bpp/clang11             904MB/s ± 0%   905MB/s ± 0%   +0.14%  (p=0.008 n=5+5)

wuffs_jpeg_decode_19k_8bpp/gcc10                  322MB/s ± 0%   362MB/s ± 0%  +12.60%  (p=0.008 n=5+5)
wuffs_jpeg_decode_30k_24bpp_progressive/gcc10     284MB/s ± 0%   328MB/s ± 0%  +15.63%  (p=0.008 n=5+5)
wuffs_jpeg_decode_30k_24bpp_sequential/gcc10      908MB/s ± 0%   926MB/s ± 0%   +2.04%  (p=0.008 n=5+5)
wuffs_jpeg_decode_77k_24bpp/gcc10                 730MB/s ± 0%   757MB/s ± 0%   +3.76%  (p=0.008 n=5+5)
wuffs_jpeg_decode_552k_24bpp_420/gcc10            785MB/s ± 0%   835MB/s ± 0%   +6.40%  (p=0.008 n=5+5)
wuffs_jpeg_decode_552k_24bpp_444/gcc10            625MB/s ± 0%   655MB/s ± 0%   +4.76%  (p=0.016 n=4+5)
wuffs_jpeg_decode_4002k_24bpp/gcc10               783MB/s ± 0%   843MB/s ± 0%   +7.57%  (p=0.016 n=5+4)
diff --git a/internal/cgen/base/pixconv-submodule-ycck.c b/internal/cgen/base/pixconv-submodule-ycck.c
index adf8d24..936bb51 100644
--- a/internal/cgen/base/pixconv-submodule-ycck.c
+++ b/internal/cgen/base/pixconv-submodule-ycck.c
@@ -1049,11 +1049,18 @@
        wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h2, inv_v2))) {
     func = &wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter;
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__) && !defined(__clang__)
+    // Don't use our AVX2 implementation for GCC (but do use it for clang). For
+    // some unknown reason, GCC performs noticeably better on the non-SIMD
+    // version. Possibly because GCC's auto-vectorizer is smarter (just with
+    // SSE2, not AVX2) than our hand-written code, but that's just a guess.
+#else
     if (wuffs_base__cpu_arch__have_x86_avx2()) {
       upfuncs[1][1] =
           wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2;
     }
 #endif
+#endif
 
   } else {
     switch (dst->pixcfg.private_impl.pixfmt.repr) {
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 61e2ab3..ecbef92 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -24871,11 +24871,18 @@
        wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h2, inv_v2))) {
     func = &wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter;
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__) && !defined(__clang__)
+    // Don't use our AVX2 implementation for GCC (but do use it for clang). For
+    // some unknown reason, GCC performs noticeably better on the non-SIMD
+    // version. Possibly because GCC's auto-vectorizer is smarter (just with
+    // SSE2, not AVX2) than our hand-written code, but that's just a guess.
+#else
     if (wuffs_base__cpu_arch__have_x86_avx2()) {
       upfuncs[1][1] =
           wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2;
     }
 #endif
+#endif
 
   } else {
     switch (dst->pixcfg.private_impl.pixfmt.repr) {