Add png.decoder.filter_3_distance_4_sse128
wuffs_png_decode_filt_3_dist_3/clang9 905MB/s ± 0% 913MB/s ± 0% +0.81% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_4/clang9 944MB/s ± 1% 1780MB/s ± 0% +88.51% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/gcc10 1.07GB/s ± 0% 1.08GB/s ± 0% +0.42% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_4/gcc10 1.01GB/s ± 1% 2.11GB/s ± 0% +109.79% (p=0.008 n=5+5)
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 9965cff..b7e4947 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -455,15 +455,36 @@
return nil
}
- b.printf("%s(", method.Str(g.tm))
- if err := g.writeExpr(b, recv, depth); err != nil {
- return err
- }
- for _, o := range args {
- b.writes(", ")
- if err := g.writeExpr(b, o.AsArg().Value(), depth); err != nil {
+ const create = "create"
+ methodStr := method.Str(g.tm)
+ if strings.HasPrefix(methodStr, create) {
+ b.printf("%s(", methodStr[len(create):])
+ for i, o := range args {
+ if i > 0 {
+ b.writes(", ")
+ }
+ after := ""
+ switch method {
+ case t.IDCreateMMSet1EPI8:
+ b.writes("(char)(")
+ after = ")"
+ }
+ if err := g.writeExpr(b, o.AsArg().Value(), depth); err != nil {
+ return err
+ }
+ b.writes(after)
+ }
+ } else {
+ b.printf("%s(", methodStr)
+ if err := g.writeExpr(b, recv, depth); err != nil {
return err
}
+ for _, o := range args {
+ b.writes(", ")
+ if err := g.writeExpr(b, o.AsArg().Value(), depth); err != nil {
+ return err
+ }
+ }
}
b.writes(")")
return nil
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index b7d713a..60d804b 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -536,7 +536,14 @@
"sse128_i.truncate_u32() u32",
// TODO: generate these methods automatically?
- "sse128_i._mm_add_epi8!(b: sse128_i) sse128_i",
+
+ "sse128_i.create_mm_set1_epi8(a: u8) sse128_i",
+
+ "sse128_i._mm_add_epi8(b: sse128_i) sse128_i",
+ "sse128_i._mm_and_si128(b: sse128_i) sse128_i",
+ "sse128_i._mm_avg_epu8(b: sse128_i) sse128_i",
+ "sse128_i._mm_sub_epi8(b: sse128_i) sse128_i",
+ "sse128_i._mm_xor_si128(b: sse128_i) sse128_i",
}
var Interfaces = []string{
diff --git a/lang/token/list.go b/lang/token/list.go
index 24ad4a9..ea3af92 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -678,6 +678,8 @@
IDStoreSlice = ID(0x388)
IDTruncateU32 = ID(0x389)
IDTruncateU64 = ID(0x38A)
+
+ IDCreateMMSet1EPI8 = ID(0x390)
)
var builtInsByID = [nBuiltInIDs]string{
@@ -1072,6 +1074,8 @@
IDStoreSlice: "store_slice",
IDTruncateU32: "truncate_u32",
IDTruncateU64: "truncate_u64",
+
+ IDCreateMMSet1EPI8: "create_mm_set1_epi8",
}
var builtInsByName = map[string]ID{}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 6430220..d6ed9d9 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30298,6 +30298,14 @@
wuffs_base__slice_u8 a_curr);
#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_sse128(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
static wuffs_base__empty_struct
wuffs_png__decoder__choose_filter_implementations(
wuffs_png__decoder* self);
@@ -31073,6 +31081,69 @@
}
#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+// -------- func png.decoder.filter_3_distance_4_sse128
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_sse128(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev) {
+ wuffs_base__slice_u8 v_c = {0};
+ wuffs_base__slice_u8 v_p = {0};
+ __m128i v_x128 = {0};
+ __m128i v_a128 = {0};
+ __m128i v_b128 = {0};
+ __m128i v_p128 = {0};
+ __m128i v_k128 = {0};
+
+ if (((uint64_t)(a_prev.len)) == 0) {
+ v_k128 = _mm_set1_epi8((char)(254));
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c = i_slice_c;
+ v_c.len = 4;
+ uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
+ while (v_c.ptr < i_end0_c) {
+ v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
+ (v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr))), wuffs_base__make_empty_struct());
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ (wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))), wuffs_base__make_empty_struct());
+ v_c.ptr += 4;
+ }
+ }
+ } else {
+ v_k128 = _mm_set1_epi8((char)(1));
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c = i_slice_c;
+ wuffs_base__slice_u8 i_slice_p = a_prev;
+ v_p = i_slice_p;
+ i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
+ while (v_c.ptr < i_end0_c) {
+ (v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr))), wuffs_base__make_empty_struct());
+ v_p128 = _mm_avg_epu8(v_a128, v_b128);
+ v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
+ (v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr))), wuffs_base__make_empty_struct());
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ (wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))), wuffs_base__make_empty_struct());
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ }
+ }
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
// -------- func png.decoder.set_quirk_enabled
WUFFS_BASE__MAYBE_STATIC wuffs_base__empty_struct
@@ -31569,6 +31640,9 @@
#endif
&wuffs_png__decoder__filter_1_distance_4_fallback);
self->private_impl.choosy_filter_3 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ wuffs_base__cpu_arch__have_sse128() ? &wuffs_png__decoder__filter_3_distance_4_sse128 :
+#endif
&wuffs_png__decoder__filter_3_distance_4_fallback);
self->private_impl.choosy_filter_4 = (
&wuffs_png__decoder__filter_4_distance_4_fallback);
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index 7858541..ac18984 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -47,8 +47,74 @@
iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
x128.load_u32!(a: c.peek_u32le())
- x128 = x128._mm_add_epi8!(b: a128)
+ x128 = x128._mm_add_epi8(b: a128)
a128 = x128
c.poke_u32le!(a: x128.truncate_u32())
}
}
+
+// --------
+
+// Filter 3: Average.
+
+// Similar to filter_1_distance_3_sse128, the SIMD implementation for (filter =
+// 3, distance = 3) doesn't actually bench faster than the non-SIMD one.
+//
+// pri func decoder.filter_3_distance_3_sse128!(curr: slice base.u8, prev: slice base.u8),
+// choose cpu_arch >= sse128,
+// {
+// etc
+// }
+
+pri func decoder.filter_3_distance_4_sse128!(curr: slice base.u8, prev: slice base.u8),
+ choose cpu_arch >= sse128,
+{
+ var c : slice base.u8
+ var p : slice base.u8
+ var x128 : base.sse128_i
+ var a128 : base.sse128_i
+ var b128 : base.sse128_i
+ var p128 : base.sse128_i
+ var k128 : base.sse128_i
+
+ if args.prev.length() == 0 {
+ k128 = k128.create_mm_set1_epi8(a: 0xFE)
+ iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+ // The predictor, p128, is just half (rounded down) of the previous
+ // pixel, a128. In this branch, b128 stays zero so the average of
+ // a128 and b128 is just half of a128. _mm_avg_epu8 rounds up, but
+ // (a128 & 0xFE_repeated) clears the low bit of each a128 byte.
+ p128 = a128._mm_and_si128(b: k128)._mm_avg_epu8(b: b128)
+
+ // Add the predictor to the residual and, for the next iteration,
+ // set its previous pixel, a128, to this one, x128.
+ x128.load_u32!(a: c.peek_u32le())
+ x128 = x128._mm_add_epi8(b: p128)
+ a128 = x128
+ c.poke_u32le!(a: x128.truncate_u32())
+ }
+
+ } else {
+ k128 = k128.create_mm_set1_epi8(a: 0x01)
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
+ // Load the pixel from the row above.
+ b128.load_u32!(a: p.peek_u32le())
+
+ // The predictor, p128, is the average (rounded down) of the
+ // previous pixel, a128, and the pixel above, b128.
+ p128 = a128._mm_avg_epu8(b: b128)
+
+ // Subtract a correction term because _mm_avg_epu8 rounds up but
+ // the PNG filter rounds down. The correction term is the low bit
+ // of each byte of (a128 ^ b128).
+ p128 = p128._mm_sub_epi8(b: k128._mm_and_si128(b: a128._mm_xor_si128(b: b128)))
+
+ // Add the predictor to the residual and, for the next iteration,
+ // set its previous pixel, a128, to this one, x128.
+ x128.load_u32!(a: c.peek_u32le())
+ x128 = x128._mm_add_epi8(b: p128)
+ a128 = x128
+ c.poke_u32le!(a: x128.truncate_u32())
+ }
+ }
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 45c8746..0dc152e 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -256,7 +256,9 @@
choose filter_1 = [
filter_1_distance_4_sse128,
filter_1_distance_4_fallback]
- choose filter_3 = [filter_3_distance_4_fallback]
+ choose filter_3 = [
+ filter_3_distance_4_sse128,
+ filter_3_distance_4_fallback]
choose filter_4 = [filter_4_distance_4_fallback]
}
}