Add std/png bench comment for the rollback
diff --git a/std/png/decode_filter_sse42.wuffs b/std/png/decode_filter_sse42.wuffs
index e96a304..7c19f8d 100644
--- a/std/png/decode_filter_sse42.wuffs
+++ b/std/png/decode_filter_sse42.wuffs
@@ -37,6 +37,10 @@
 //         c.poke_u24le!(a: x128.truncate_u32())
 //     }
 // }
+//
+// Note that "more SIMD" doesn't always mean faster code. See the commit
+// https://github.com/google/wuffs/commit/1660f9268621ed4415b3b363f0a0e1026d4aa83d
+// "Have std/png filter_1_distance_? use more SIMD" for a pessimizing example.
 
 pri func decoder.filter_1_distance_4_sse42!(curr: slice base.u8),
 	choose cpu_arch >= x86_sse42,