Let iterate have multiple assigns
name old speed new speed delta
wuffs_png_decode_19k_8bpp/clang9 92.2MB/s ± 0% 93.4MB/s ± 0% +1.33% (p=0.008 n=5+5)
wuffs_png_decode_40k_24bpp/clang9 116MB/s ± 0% 115MB/s ± 0% ~ (p=1.000 n=5+5)
wuffs_png_decode_77k_8bpp/clang9 347MB/s ± 0% 351MB/s ± 0% +1.19% (p=0.016 n=5+4)
wuffs_png_decode_552k_32bpp/clang9 150MB/s ± 0% 156MB/s ± 0% +3.57% (p=0.008 n=5+5)
wuffs_png_decode_4002k_24bpp/clang9 117MB/s ± 1% 118MB/s ± 0% +0.62% (p=0.008 n=5+5)
wuffs_png_decode_filter_1_sub/clang9 1.34GB/s ± 0% 1.24GB/s ± 0% -7.30% (p=0.008 n=5+5)
wuffs_png_decode_filter_2_up/clang9 13.3GB/s ± 0% 13.4GB/s ± 0% +0.96% (p=0.008 n=5+5)
wuffs_png_decode_filter_3_average/clang9 754MB/s ± 0% 948MB/s ± 0% +25.62% (p=0.008 n=5+5)
wuffs_png_decode_filter_4_paeth/clang9 127MB/s ± 0% 138MB/s ± 0% +9.01% (p=0.008 n=5+5)
wuffs_png_decode_19k_8bpp/gcc10 100MB/s ± 1% 100MB/s ± 0% ~ (p=0.690 n=5+5)
wuffs_png_decode_40k_24bpp/gcc10 114MB/s ± 0% 118MB/s ± 0% +3.21% (p=0.008 n=5+5)
wuffs_png_decode_77k_8bpp/gcc10 323MB/s ± 2% 325MB/s ± 0% ~ (p=0.841 n=5+5)
wuffs_png_decode_552k_32bpp/gcc10 159MB/s ± 0% 172MB/s ± 0% +8.38% (p=0.008 n=5+5)
wuffs_png_decode_4002k_24bpp/gcc10 118MB/s ± 0% 118MB/s ± 0% ~ (p=0.056 n=5+5)
wuffs_png_decode_filter_1_sub/gcc10 1.84GB/s ± 1% 1.85GB/s ± 0% +0.22% (p=0.008 n=5+5)
wuffs_png_decode_filter_2_up/gcc10 10.5GB/s ± 0% 10.5GB/s ± 0% ~ (p=0.310 n=5+5)
wuffs_png_decode_filter_3_average/gcc10 743MB/s ± 0% 1008MB/s ± 0% +35.66% (p=0.008 n=5+5)
wuffs_png_decode_filter_4_paeth/gcc10 83.1MB/s ± 0% 94.1MB/s ± 0% +13.15% (p=0.008 n=5+5)
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index 170b3be..ca23e0d 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -396,22 +396,26 @@
if len(assigns) == 0 {
return nil
}
- if len(assigns) != 1 {
- return fmt.Errorf("TODO: iterate over more than one assign")
- }
- o := assigns[0].AsAssign()
- name := o.LHS().Ident().Str(g.tm)
+ name0 := assigns[0].AsAssign().LHS().Ident().Str(g.tm)
b.writes("{\n")
// TODO: don't assume that the slice is a slice of base.u8. In
// particular, the code gen can be subtle if the slice element type has
// zero size, such as the empty struct.
- b.printf("wuffs_base__slice_u8 %sslice_%s = ", iPrefix, name)
- if err := g.writeExpr(b, o.RHS(), 0); err != nil {
- return err
+ for i, o := range assigns {
+ o := o.AsAssign()
+ name := o.LHS().Ident().Str(g.tm)
+ b.printf("wuffs_base__slice_u8 %sslice_%s = ", iPrefix, name)
+ if err := g.writeExpr(b, o.RHS(), 0); err != nil {
+ return err
+ }
+ b.writes(";\n")
+ b.printf("%s%s = %sslice_%s;\n", vPrefix, name, iPrefix, name)
+ if i > 0 {
+ b.printf("%sslice_%s.len = wuffs_base__u64__min(%sslice_%s.len, %sslice_%s.len);\n",
+ iPrefix, name0, iPrefix, name0, iPrefix, name)
+ }
}
- b.writes(";\n")
- b.printf("%s%s = %sslice_%s;\n", vPrefix, name, iPrefix, name)
// TODO: look at n.HasContinue() and n.HasBreak().
round := uint32(0)
@@ -425,7 +429,7 @@
return err
}
for {
- if err := g.writeIterateRound(b, name, n.Body(), round, depth, length, unroll); err != nil {
+ if err := g.writeIterateRound(b, assigns, n.Body(), round, depth, length, unroll); err != nil {
return err
}
round++
@@ -566,18 +570,25 @@
return nil
}
-func (g *gen) writeIterateRound(b *buffer, name string, body []*a.Node, round uint32, depth uint32, length int, unroll int) error {
- b.printf("%s%s.len = %d;\n", vPrefix, name, length)
+func (g *gen) writeIterateRound(b *buffer, assigns []*a.Node, body []*a.Node, round uint32, depth uint32, length int, unroll int) error {
+ for _, o := range assigns {
+ name := o.AsAssign().LHS().Ident().Str(g.tm)
+ b.printf("%s%s.len = %d;\n", vPrefix, name, length)
+ }
+ name0 := assigns[0].AsAssign().LHS().Ident().Str(g.tm)
b.printf("uint8_t* %send%d_%s = %sslice_%s.ptr + (%sslice_%s.len / %d) * %d;\n",
- iPrefix, round, name, iPrefix, name, iPrefix, name, length*unroll, length*unroll)
- b.printf("while (%s%s.ptr < %send%d_%s) {\n", vPrefix, name, iPrefix, round, name)
+ iPrefix, round, name0, iPrefix, name0, iPrefix, name0, length*unroll, length*unroll)
+ b.printf("while (%s%s.ptr < %send%d_%s) {\n", vPrefix, name0, iPrefix, round, name0)
for i := 0; i < unroll; i++ {
for _, o := range body {
if err := g.writeStatement(b, o, depth); err != nil {
return err
}
}
- b.printf("%s%s.ptr += %d;\n", vPrefix, name, length)
+ for _, o := range assigns {
+ name := o.AsAssign().LHS().Ident().Str(g.tm)
+ b.printf("%s%s.ptr += %d;\n", vPrefix, name, length)
+ }
}
b.writes("}\n")
return nil
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index ace5131..61c7dda 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -29880,6 +29880,12 @@
wuffs_base__slice_u8 a_prev);
static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_fallback(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev);
+
+static wuffs_base__empty_struct
wuffs_png__decoder__filter_4(
wuffs_png__decoder* self,
wuffs_base__slice_u8 a_curr,
@@ -29892,6 +29898,12 @@
wuffs_base__slice_u8 a_prev);
static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_fallback(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev);
+
+static wuffs_base__empty_struct
wuffs_png__decoder__choose_filter_implementations(
wuffs_png__decoder* self);
@@ -30071,27 +30083,27 @@
wuffs_png__decoder__filter_1_distance_4_fallback(
wuffs_png__decoder* self,
wuffs_base__slice_u8 a_curr) {
- wuffs_base__slice_u8 v_p = {0};
+ wuffs_base__slice_u8 v_c = {0};
uint8_t v_fa0 = 0;
uint8_t v_fa1 = 0;
uint8_t v_fa2 = 0;
uint8_t v_fa3 = 0;
{
- wuffs_base__slice_u8 i_slice_p = a_curr;
- v_p = i_slice_p;
- v_p.len = 4;
- uint8_t* i_end0_p = i_slice_p.ptr + (i_slice_p.len / 4) * 4;
- while (v_p.ptr < i_end0_p) {
- v_fa0 += v_p.ptr[0];
- v_p.ptr[0] = v_fa0;
- v_fa1 += v_p.ptr[1];
- v_p.ptr[1] = v_fa1;
- v_fa2 += v_p.ptr[2];
- v_p.ptr[2] = v_fa2;
- v_fa3 += v_p.ptr[3];
- v_p.ptr[3] = v_fa3;
- v_p.ptr += 4;
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c = i_slice_c;
+ v_c.len = 4;
+ uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+ while (v_c.ptr < i_end0_c) {
+ v_fa0 += v_c.ptr[0];
+ v_c.ptr[0] = v_fa0;
+ v_fa1 += v_c.ptr[1];
+ v_c.ptr[1] = v_fa1;
+ v_fa2 += v_c.ptr[2];
+ v_c.ptr[2] = v_fa2;
+ v_fa3 += v_c.ptr[3];
+ v_c.ptr[3] = v_fa3;
+ v_c.ptr += 4;
}
}
return wuffs_base__make_empty_struct();
@@ -30166,6 +30178,65 @@
return wuffs_base__make_empty_struct();
}
+// -------- func png.decoder.filter_3_distance_4_fallback
+
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_fallback(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev) {
+ wuffs_base__slice_u8 v_c = {0};
+ wuffs_base__slice_u8 v_p = {0};
+ uint8_t v_fa0 = 0;
+ uint8_t v_fa1 = 0;
+ uint8_t v_fa2 = 0;
+ uint8_t v_fa3 = 0;
+
+ if (((uint64_t)(a_prev.len)) == 0) {
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c = i_slice_c;
+ v_c.len = 4;
+ uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+ while (v_c.ptr < i_end0_c) {
+ v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
+ v_c.ptr[0] = v_fa0;
+ v_fa1 = ((v_fa1 / 2) + v_c.ptr[1]);
+ v_c.ptr[1] = v_fa1;
+ v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
+ v_c.ptr[2] = v_fa2;
+ v_fa3 = ((v_fa3 / 2) + v_c.ptr[3]);
+ v_c.ptr[3] = v_fa3;
+ v_c.ptr += 4;
+ }
+ }
+ } else {
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c = i_slice_c;
+ wuffs_base__slice_u8 i_slice_p = a_prev;
+ v_p = i_slice_p;
+ i_slice_c.len = wuffs_base__u64__min(i_slice_c.len, i_slice_p.len);
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+ while (v_c.ptr < i_end0_c) {
+ v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
+ v_c.ptr[0] = v_fa0;
+ v_fa1 = (((uint8_t)(((((uint32_t)(v_fa1)) + ((uint32_t)(v_p.ptr[1]))) / 2))) + v_c.ptr[1]);
+ v_c.ptr[1] = v_fa1;
+ v_fa2 = (((uint8_t)(((((uint32_t)(v_fa2)) + ((uint32_t)(v_p.ptr[2]))) / 2))) + v_c.ptr[2]);
+ v_c.ptr[2] = v_fa2;
+ v_fa3 = (((uint8_t)(((((uint32_t)(v_fa3)) + ((uint32_t)(v_p.ptr[3]))) / 2))) + v_c.ptr[3]);
+ v_c.ptr[3] = v_fa3;
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ }
+ }
+ return wuffs_base__make_empty_struct();
+}
+
// -------- func png.decoder.filter_4
static wuffs_base__empty_struct
@@ -30229,6 +30300,153 @@
return wuffs_base__make_empty_struct();
}
+// -------- func png.decoder.filter_4_distance_4_fallback
+
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_fallback(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr,
+ wuffs_base__slice_u8 a_prev) {
+ wuffs_base__slice_u8 v_c = {0};
+ wuffs_base__slice_u8 v_p = {0};
+ uint32_t v_fa0 = 0;
+ uint32_t v_fa1 = 0;
+ uint32_t v_fa2 = 0;
+ uint32_t v_fa3 = 0;
+ uint32_t v_fb0 = 0;
+ uint32_t v_fb1 = 0;
+ uint32_t v_fb2 = 0;
+ uint32_t v_fb3 = 0;
+ uint32_t v_fc0 = 0;
+ uint32_t v_fc1 = 0;
+ uint32_t v_fc2 = 0;
+ uint32_t v_fc3 = 0;
+ uint32_t v_pp0 = 0;
+ uint32_t v_pp1 = 0;
+ uint32_t v_pp2 = 0;
+ uint32_t v_pp3 = 0;
+ uint32_t v_pa0 = 0;
+ uint32_t v_pa1 = 0;
+ uint32_t v_pa2 = 0;
+ uint32_t v_pa3 = 0;
+ uint32_t v_pb0 = 0;
+ uint32_t v_pb1 = 0;
+ uint32_t v_pb2 = 0;
+ uint32_t v_pb3 = 0;
+ uint32_t v_pc0 = 0;
+ uint32_t v_pc1 = 0;
+ uint32_t v_pc2 = 0;
+ uint32_t v_pc3 = 0;
+
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c = i_slice_c;
+ wuffs_base__slice_u8 i_slice_p = a_prev;
+ v_p = i_slice_p;
+ i_slice_c.len = wuffs_base__u64__min(i_slice_c.len, i_slice_p.len);
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+ while (v_c.ptr < i_end0_c) {
+ v_fb0 = ((uint32_t)(v_p.ptr[0]));
+ v_pp0 = ((v_fa0 + v_fb0) - v_fc0);
+ v_pa0 = (v_pp0 - v_fa0);
+ if (v_pa0 >= 2147483648) {
+ v_pa0 = (0 - v_pa0);
+ }
+ v_pb0 = (v_pp0 - v_fb0);
+ if (v_pb0 >= 2147483648) {
+ v_pb0 = (0 - v_pb0);
+ }
+ v_pc0 = (v_pp0 - v_fc0);
+ if (v_pc0 >= 2147483648) {
+ v_pc0 = (0 - v_pc0);
+ }
+ if ((v_pa0 <= v_pb0) && (v_pa0 <= v_pc0)) {
+ } else if (v_pb0 <= v_pc0) {
+ v_fa0 = v_fb0;
+ } else {
+ v_fa0 = v_fc0;
+ }
+ v_c.ptr[0] += ((uint8_t)((v_fa0 & 255)));
+ v_fa0 = ((uint32_t)(v_c.ptr[0]));
+ v_fc0 = v_fb0;
+ v_fb1 = ((uint32_t)(v_p.ptr[1]));
+ v_pp1 = ((v_fa1 + v_fb1) - v_fc1);
+ v_pa1 = (v_pp1 - v_fa1);
+ if (v_pa1 >= 2147483648) {
+ v_pa1 = (0 - v_pa1);
+ }
+ v_pb1 = (v_pp1 - v_fb1);
+ if (v_pb1 >= 2147483648) {
+ v_pb1 = (0 - v_pb1);
+ }
+ v_pc1 = (v_pp1 - v_fc1);
+ if (v_pc1 >= 2147483648) {
+ v_pc1 = (0 - v_pc1);
+ }
+ if ((v_pa1 <= v_pb1) && (v_pa1 <= v_pc1)) {
+ } else if (v_pb1 <= v_pc1) {
+ v_fa1 = v_fb1;
+ } else {
+ v_fa1 = v_fc1;
+ }
+ v_c.ptr[1] += ((uint8_t)((v_fa1 & 255)));
+ v_fa1 = ((uint32_t)(v_c.ptr[1]));
+ v_fc1 = v_fb1;
+ v_fb2 = ((uint32_t)(v_p.ptr[2]));
+ v_pp2 = ((v_fa2 + v_fb2) - v_fc2);
+ v_pa2 = (v_pp2 - v_fa2);
+ if (v_pa2 >= 2147483648) {
+ v_pa2 = (0 - v_pa2);
+ }
+ v_pb2 = (v_pp2 - v_fb2);
+ if (v_pb2 >= 2147483648) {
+ v_pb2 = (0 - v_pb2);
+ }
+ v_pc2 = (v_pp2 - v_fc2);
+ if (v_pc2 >= 2147483648) {
+ v_pc2 = (0 - v_pc2);
+ }
+ if ((v_pa2 <= v_pb2) && (v_pa2 <= v_pc2)) {
+ } else if (v_pb2 <= v_pc2) {
+ v_fa2 = v_fb2;
+ } else {
+ v_fa2 = v_fc2;
+ }
+ v_c.ptr[2] += ((uint8_t)((v_fa2 & 255)));
+ v_fa2 = ((uint32_t)(v_c.ptr[2]));
+ v_fc2 = v_fb2;
+ v_fb3 = ((uint32_t)(v_p.ptr[3]));
+ v_pp3 = ((v_fa3 + v_fb3) - v_fc3);
+ v_pa3 = (v_pp3 - v_fa3);
+ if (v_pa3 >= 2147483648) {
+ v_pa3 = (0 - v_pa3);
+ }
+ v_pb3 = (v_pp3 - v_fb3);
+ if (v_pb3 >= 2147483648) {
+ v_pb3 = (0 - v_pb3);
+ }
+ v_pc3 = (v_pp3 - v_fc3);
+ if (v_pc3 >= 2147483648) {
+ v_pc3 = (0 - v_pc3);
+ }
+ if ((v_pa3 <= v_pb3) && (v_pa3 <= v_pc3)) {
+ } else if (v_pb3 <= v_pc3) {
+ v_fa3 = v_fb3;
+ } else {
+ v_fa3 = v_fc3;
+ }
+ v_c.ptr[3] += ((uint8_t)((v_fa3 & 255)));
+ v_fa3 = ((uint32_t)(v_c.ptr[3]));
+ v_fc3 = v_fb3;
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ }
+ return wuffs_base__make_empty_struct();
+}
+
// -------- func png.decoder.set_quirk_enabled
WUFFS_BASE__MAYBE_STATIC wuffs_base__empty_struct
@@ -30714,6 +30932,8 @@
if (self->private_impl.f_filter_distance == 3) {
} else if (self->private_impl.f_filter_distance == 4) {
self->private_impl.choosy_filter_1 = &wuffs_png__decoder__filter_1_distance_4_fallback;
+ self->private_impl.choosy_filter_3 = &wuffs_png__decoder__filter_3_distance_4_fallback;
+ self->private_impl.choosy_filter_4 = &wuffs_png__decoder__filter_4_distance_4_fallback;
}
return wuffs_base__make_empty_struct();
}
diff --git a/std/png/decode_filter_fallback.wuffs b/std/png/decode_filter_fallback.wuffs
index 972d990..b879ea6 100644
--- a/std/png/decode_filter_fallback.wuffs
+++ b/std/png/decode_filter_fallback.wuffs
@@ -38,21 +38,21 @@
}
pri func decoder.filter_1_distance_4_fallback!(curr: slice base.u8) {
- var p : slice base.u8
+ var c : slice base.u8
var fa0 : base.u8
var fa1 : base.u8
var fa2 : base.u8
var fa3 : base.u8
- iterate (p = args.curr)(length: 4, unroll: 1) {
- fa0 ~mod+= p[0]
- p[0] = fa0
- fa1 ~mod+= p[1]
- p[1] = fa1
- fa2 ~mod+= p[2]
- p[2] = fa2
- fa3 ~mod+= p[3]
- p[3] = fa3
+ iterate (c = args.curr)(length: 4, unroll: 1) {
+ fa0 ~mod+= c[0]
+ c[0] = fa0
+ fa1 ~mod+= c[1]
+ c[1] = fa1
+ fa2 ~mod+= c[2]
+ c[2] = fa2
+ fa3 ~mod+= c[3]
+ c[3] = fa3
}
}
@@ -131,6 +131,40 @@
}
}
+pri func decoder.filter_3_distance_4_fallback!(curr: slice base.u8, prev: slice base.u8) {
+ var c : slice base.u8
+ var p : slice base.u8
+ var fa0 : base.u8
+ var fa1 : base.u8
+ var fa2 : base.u8
+ var fa3 : base.u8
+
+ if args.prev.length() == 0 {
+ iterate (c = args.curr)(length: 4, unroll: 1) {
+ fa0 = (fa0 / 2) ~mod+ c[0]
+ c[0] = fa0
+ fa1 = (fa1 / 2) ~mod+ c[1]
+ c[1] = fa1
+ fa2 = (fa2 / 2) ~mod+ c[2]
+ c[2] = fa2
+ fa3 = (fa3 / 2) ~mod+ c[3]
+ c[3] = fa3
+ }
+
+ } else {
+ iterate (c = args.curr, p = args.prev)(length: 4, unroll: 1) {
+ fa0 = ((((fa0 as base.u32) + (p[0] as base.u32)) / 2) as base.u8) ~mod+ c[0]
+ c[0] = fa0
+ fa1 = ((((fa1 as base.u32) + (p[1] as base.u32)) / 2) as base.u8) ~mod+ c[1]
+ c[1] = fa1
+ fa2 = ((((fa2 as base.u32) + (p[2] as base.u32)) / 2) as base.u8) ~mod+ c[2]
+ c[2] = fa2
+ fa3 = ((((fa3 as base.u32) + (p[3] as base.u32)) / 2) as base.u8) ~mod+ c[3]
+ c[3] = fa3
+ }
+ }
+}
+
pri func decoder.filter_4!(curr: slice base.u8, prev: slice base.u8),
choosy,
{
@@ -200,3 +234,138 @@
assert i >= filter_distance via "a >= b: a >= (b + c); 0 <= c"(c: 1)
} endwhile
}
+
+pri func decoder.filter_4_distance_4_fallback!(curr: slice base.u8, prev: slice base.u8) {
+ var c : slice base.u8
+ var p : slice base.u8
+ var fa0 : base.u32
+ var fa1 : base.u32
+ var fa2 : base.u32
+ var fa3 : base.u32
+ var fb0 : base.u32
+ var fb1 : base.u32
+ var fb2 : base.u32
+ var fb3 : base.u32
+ var fc0 : base.u32
+ var fc1 : base.u32
+ var fc2 : base.u32
+ var fc3 : base.u32
+ var pp0 : base.u32
+ var pp1 : base.u32
+ var pp2 : base.u32
+ var pp3 : base.u32
+ var pa0 : base.u32
+ var pa1 : base.u32
+ var pa2 : base.u32
+ var pa3 : base.u32
+ var pb0 : base.u32
+ var pb1 : base.u32
+ var pb2 : base.u32
+ var pb3 : base.u32
+ var pc0 : base.u32
+ var pc1 : base.u32
+ var pc2 : base.u32
+ var pc3 : base.u32
+
+ iterate (c = args.curr, p = args.prev)(length: 4, unroll: 1) {
+ fb0 = p[0] as base.u32
+ pp0 = (fa0 ~mod+ fb0) ~mod- fc0
+ pa0 = pp0 ~mod- fa0
+ if pa0 >= 0x8000_0000 {
+ pa0 = 0 ~mod- pa0
+ }
+ pb0 = pp0 ~mod- fb0
+ if pb0 >= 0x8000_0000 {
+ pb0 = 0 ~mod- pb0
+ }
+ pc0 = pp0 ~mod- fc0
+ if pc0 >= 0x8000_0000 {
+ pc0 = 0 ~mod- pc0
+ }
+ if (pa0 <= pb0) and (pa0 <= pc0) {
+ // No-op.
+ } else if pb0 <= pc0 {
+ fa0 = fb0
+ } else {
+ fa0 = fc0
+ }
+ c[0] ~mod+= (fa0 & 0xFF) as base.u8
+ fa0 = c[0] as base.u32
+ fc0 = fb0
+
+ fb1 = p[1] as base.u32
+ pp1 = (fa1 ~mod+ fb1) ~mod- fc1
+ pa1 = pp1 ~mod- fa1
+ if pa1 >= 0x8000_0000 {
+ pa1 = 0 ~mod- pa1
+ }
+ pb1 = pp1 ~mod- fb1
+ if pb1 >= 0x8000_0000 {
+ pb1 = 0 ~mod- pb1
+ }
+ pc1 = pp1 ~mod- fc1
+ if pc1 >= 0x8000_0000 {
+ pc1 = 0 ~mod- pc1
+ }
+ if (pa1 <= pb1) and (pa1 <= pc1) {
+ // No-op.
+ } else if pb1 <= pc1 {
+ fa1 = fb1
+ } else {
+ fa1 = fc1
+ }
+ c[1] ~mod+= (fa1 & 0xFF) as base.u8
+ fa1 = c[1] as base.u32
+ fc1 = fb1
+
+ fb2 = p[2] as base.u32
+ pp2 = (fa2 ~mod+ fb2) ~mod- fc2
+ pa2 = pp2 ~mod- fa2
+ if pa2 >= 0x8000_0000 {
+ pa2 = 0 ~mod- pa2
+ }
+ pb2 = pp2 ~mod- fb2
+ if pb2 >= 0x8000_0000 {
+ pb2 = 0 ~mod- pb2
+ }
+ pc2 = pp2 ~mod- fc2
+ if pc2 >= 0x8000_0000 {
+ pc2 = 0 ~mod- pc2
+ }
+ if (pa2 <= pb2) and (pa2 <= pc2) {
+ // No-op.
+ } else if pb2 <= pc2 {
+ fa2 = fb2
+ } else {
+ fa2 = fc2
+ }
+ c[2] ~mod+= (fa2 & 0xFF) as base.u8
+ fa2 = c[2] as base.u32
+ fc2 = fb2
+
+ fb3 = p[3] as base.u32
+ pp3 = (fa3 ~mod+ fb3) ~mod- fc3
+ pa3 = pp3 ~mod- fa3
+ if pa3 >= 0x8000_0000 {
+ pa3 = 0 ~mod- pa3
+ }
+ pb3 = pp3 ~mod- fb3
+ if pb3 >= 0x8000_0000 {
+ pb3 = 0 ~mod- pb3
+ }
+ pc3 = pp3 ~mod- fc3
+ if pc3 >= 0x8000_0000 {
+ pc3 = 0 ~mod- pc3
+ }
+ if (pa3 <= pb3) and (pa3 <= pc3) {
+ // No-op.
+ } else if pb3 <= pc3 {
+ fa3 = fb3
+ } else {
+ fa3 = fc3
+ }
+ c[3] ~mod+= (fa3 & 0xFF) as base.u8
+ fa3 = c[3] as base.u32
+ fc3 = fb3
+ }
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 8a89283..79d3100 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -246,10 +246,14 @@
}
pri func decoder.choose_filter_implementations!() {
+ // Filter 0 is a no-op. Filter 2, the up filter, should already be easy for
+ // a good optimizing C compiler to vectorize.
if this.filter_distance == 3 {
// TODO.
} else if this.filter_distance == 4 {
choose filter_1 = [filter_1_distance_4_fallback]
+ choose filter_3 = [filter_3_distance_4_fallback]
+ choose filter_4 = [filter_4_distance_4_fallback]
}
}