Factor out png.decoder.filter_N functions

name                                      old speed      new speed      delta
wuffs_png_decode_19k_8bpp/clang9          92.1MB/s ± 0%  95.7MB/s ± 0%   +3.89%  (p=0.008 n=5+5)
wuffs_png_decode_40k_24bpp/clang9          103MB/s ± 0%   110MB/s ± 0%   +6.81%  (p=0.008 n=5+5)
wuffs_png_decode_77k_8bpp/clang9           348MB/s ± 0%   358MB/s ± 0%   +2.90%  (p=0.008 n=5+5)
wuffs_png_decode_552k_32bpp/clang9         138MB/s ± 0%   136MB/s ± 0%   -1.92%  (p=0.008 n=5+5)
wuffs_png_decode_4002k_24bpp/clang9        103MB/s ± 0%   111MB/s ± 0%   +8.04%  (p=0.008 n=5+5)
wuffs_png_decode_filter_1_sub/clang9       738MB/s ± 0%   646MB/s ± 0%  -12.51%  (p=0.008 n=5+5)
wuffs_png_decode_filter_2_up/clang9       13.4GB/s ± 0%  13.1GB/s ± 0%   -2.51%  (p=0.008 n=5+5)
wuffs_png_decode_filter_3_average/clang9   329MB/s ± 0%   481MB/s ± 0%  +46.25%  (p=0.008 n=5+5)
wuffs_png_decode_filter_4_paeth/clang9    70.0MB/s ± 0%  65.3MB/s ± 0%   -6.80%  (p=0.008 n=5+5)

wuffs_png_decode_19k_8bpp/gcc10           98.0MB/s ± 0%  98.0MB/s ± 0%     ~     (p=0.690 n=5+5)
wuffs_png_decode_40k_24bpp/gcc10           118MB/s ± 0%   118MB/s ± 0%   +0.43%  (p=0.016 n=5+5)
wuffs_png_decode_77k_8bpp/gcc10            318MB/s ± 0%   317MB/s ± 0%   -0.19%  (p=0.008 n=5+5)
wuffs_png_decode_552k_32bpp/gcc10          161MB/s ± 0%   159MB/s ± 0%   -1.38%  (p=0.008 n=5+5)
wuffs_png_decode_4002k_24bpp/gcc10         118MB/s ± 0%   118MB/s ± 0%     ~     (p=0.056 n=5+5)
wuffs_png_decode_filter_1_sub/gcc10        932MB/s ± 0%   931MB/s ± 0%     ~     (p=0.151 n=5+5)
wuffs_png_decode_filter_2_up/gcc10        11.0GB/s ± 1%  11.4GB/s ± 0%   +3.32%  (p=0.016 n=4+5)
wuffs_png_decode_filter_3_average/gcc10    645MB/s ± 0%   643MB/s ± 0%   -0.29%  (p=0.008 n=5+5)
wuffs_png_decode_filter_4_paeth/gcc10     90.1MB/s ± 0%  83.6MB/s ± 0%   -7.24%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c index 35ab8f5..a79fa07 100644 --- a/release/c/wuffs-unsupported-snapshot.c +++ b/release/c/wuffs-unsupported-snapshot.c
@@ -29836,6 +29836,29 @@ wuffs_base__pixel_buffer* a_dst, wuffs_base__slice_u8 a_workbuf); +static wuffs_base__empty_struct +wuffs_png__decoder__filter_1( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr); + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_2( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr, + wuffs_base__slice_u8 a_prev); + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_3( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr, + wuffs_base__slice_u8 a_prev); + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_4( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr, + wuffs_base__slice_u8 a_prev); + // ---------------- VTables const wuffs_base__image_decoder__func_ptrs @@ -30942,20 +30965,11 @@ uint64_t v_dst_bytes_per_row = 0; wuffs_base__slice_u8 v_dst_palette = {0}; wuffs_base__table_u8 v_tab = {0}; - uint64_t v_filter_distance = 0; uint32_t v_y = 0; wuffs_base__slice_u8 v_dst = {0}; uint8_t v_filter = 0; wuffs_base__slice_u8 v_curr_row = {0}; wuffs_base__slice_u8 v_prev_row = {0}; - uint64_t v_i = 0; - uint32_t v_fa = 0; - uint32_t v_fb = 0; - uint32_t v_fc = 0; - uint32_t v_pp = 0; - uint32_t v_pa = 0; - uint32_t v_pb = 0; - uint32_t v_pc = 0; v_dst_pixfmt = wuffs_base__pixel_buffer__pixel_format(a_dst); v_dst_bits_per_pixel = wuffs_base__pixel_format__bits_per_pixel(&v_dst_pixfmt); @@ -30966,7 +30980,6 @@ v_dst_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * v_dst_bytes_per_pixel); v_dst_palette = wuffs_base__pixel_buffer__palette_or_else(a_dst, wuffs_base__make_slice_u8(self->private_data.f_dst_palette, 1024)); v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0); - v_filter_distance = ((uint64_t)(self->private_impl.f_filter_distance)); while (v_y < self->private_impl.f_height) { v_dst = wuffs_base__table_u8__row(v_tab, v_y); if (v_dst_bytes_per_row < ((uint64_t)(v_dst.len))) { @@ -30984,79 +30997,13 @@ a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, 
self->private_impl.f_bytes_per_row); if (v_filter == 0) { } else if (v_filter == 1) { - v_i = v_filter_distance; - while (v_i < ((uint64_t)(v_curr_row.len))) { - if (v_i >= v_filter_distance) { - if ((v_i - v_filter_distance) < ((uint64_t)(v_curr_row.len))) { - v_curr_row.ptr[v_i] += v_curr_row.ptr[(v_i - v_filter_distance)]; - } - } - v_i += 1; - } + wuffs_png__decoder__filter_1(self, v_curr_row); } else if (v_filter == 2) { - v_i = 0; - while ((v_i < ((uint64_t)(v_curr_row.len))) && (v_i < ((uint64_t)(v_prev_row.len)))) { - v_curr_row.ptr[v_i] += v_prev_row.ptr[v_i]; - v_i += 1; - } + wuffs_png__decoder__filter_2(self, v_curr_row, v_prev_row); } else if (v_filter == 3) { - if (v_y == 0) { - v_i = v_filter_distance; - while (v_i < ((uint64_t)(v_curr_row.len))) { - if (v_i >= v_filter_distance) { - if ((v_i - v_filter_distance) < ((uint64_t)(v_curr_row.len))) { - v_curr_row.ptr[v_i] += (v_curr_row.ptr[(v_i - v_filter_distance)] / 2); - } - } - v_i += 1; - } - } else { - v_i = 0; - while ((v_i < ((uint64_t)(v_curr_row.len))) && (v_i < ((uint64_t)(v_prev_row.len)))) { - if (v_i >= v_filter_distance) { - if ((v_i - v_filter_distance) < ((uint64_t)(v_curr_row.len))) { - v_curr_row.ptr[v_i] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[(v_i - v_filter_distance)])) + ((uint32_t)(v_prev_row.ptr[v_i]))) / 2))); - } - } else { - v_curr_row.ptr[v_i] += (v_prev_row.ptr[v_i] / 2); - } - v_i += 1; - } - } + wuffs_png__decoder__filter_3(self, v_curr_row, v_prev_row); } else if (v_filter == 4) { - v_i = 0; - while ((v_i < ((uint64_t)(v_curr_row.len))) && (v_i < ((uint64_t)(v_prev_row.len)))) { - if (v_i < v_filter_distance) { - v_curr_row.ptr[v_i] += v_prev_row.ptr[v_i]; - } else { - if (((v_i - v_filter_distance) < ((uint64_t)(v_curr_row.len))) && ((v_i - v_filter_distance) < ((uint64_t)(v_prev_row.len)))) { - v_fa = ((uint32_t)(v_curr_row.ptr[(v_i - v_filter_distance)])); - v_fb = ((uint32_t)(v_prev_row.ptr[v_i])); - v_fc = ((uint32_t)(v_prev_row.ptr[(v_i - v_filter_distance)])); 
- v_pp = ((v_fa + v_fb) - v_fc); - v_pa = (v_pp - v_fa); - if (v_pa >= 2147483648) { - v_pa = (0 - v_pa); - } - v_pb = (v_pp - v_fb); - if (v_pb >= 2147483648) { - v_pb = (0 - v_pb); - } - v_pc = (v_pp - v_fc); - if (v_pc >= 2147483648) { - v_pc = (0 - v_pc); - } - if ((v_pa <= v_pb) && (v_pa <= v_pc)) { - v_curr_row.ptr[v_i] += ((uint8_t)((v_fa & 255))); - } else if (v_pb <= v_pc) { - v_curr_row.ptr[v_i] += ((uint8_t)((v_fb & 255))); - } else { - v_curr_row.ptr[v_i] += ((uint8_t)((v_fc & 255))); - } - } - } - v_i += 1; - } + wuffs_png__decoder__filter_4(self, v_curr_row, v_prev_row); } else { return wuffs_base__make_status(wuffs_png__error__bad_filter); } @@ -31067,6 +31014,136 @@ return wuffs_base__make_status(NULL); } +// -------- func png.decoder.filter_1 + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_1( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr) { + uint64_t v_filter_distance = 0; + uint64_t v_i = 0; + + v_filter_distance = ((uint64_t)(self->private_impl.f_filter_distance)); + v_i = v_filter_distance; + while (v_i < ((uint64_t)(a_curr.len))) { + if (v_i >= v_filter_distance) { + if ((v_i - v_filter_distance) < ((uint64_t)(a_curr.len))) { + a_curr.ptr[v_i] += a_curr.ptr[(v_i - v_filter_distance)]; + } + } + v_i += 1; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func png.decoder.filter_2 + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_2( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr, + wuffs_base__slice_u8 a_prev) { + uint64_t v_i = 0; + + v_i = 0; + while ((v_i < ((uint64_t)(a_curr.len))) && (v_i < ((uint64_t)(a_prev.len)))) { + a_curr.ptr[v_i] += a_prev.ptr[v_i]; + v_i += 1; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func png.decoder.filter_3 + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_3( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr, + wuffs_base__slice_u8 a_prev) { + uint64_t v_filter_distance = 0; + uint64_t v_i = 0; + + 
v_filter_distance = ((uint64_t)(self->private_impl.f_filter_distance)); + if (((uint64_t)(a_prev.len)) == 0) { + v_i = v_filter_distance; + while (v_i < ((uint64_t)(a_curr.len))) { + if (v_i >= v_filter_distance) { + if ((v_i - v_filter_distance) < ((uint64_t)(a_curr.len))) { + a_curr.ptr[v_i] += (a_curr.ptr[(v_i - v_filter_distance)] / 2); + } + } + v_i += 1; + } + } else { + v_i = 0; + while ((v_i < ((uint64_t)(a_curr.len))) && (v_i < ((uint64_t)(a_prev.len)))) { + if (v_i >= v_filter_distance) { + if ((v_i - v_filter_distance) < ((uint64_t)(a_curr.len))) { + a_curr.ptr[v_i] += ((uint8_t)(((((uint32_t)(a_curr.ptr[(v_i - v_filter_distance)])) + ((uint32_t)(a_prev.ptr[v_i]))) / 2))); + } + } else { + a_curr.ptr[v_i] += (a_prev.ptr[v_i] / 2); + } + v_i += 1; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func png.decoder.filter_4 + +static wuffs_base__empty_struct +wuffs_png__decoder__filter_4( + wuffs_png__decoder* self, + wuffs_base__slice_u8 a_curr, + wuffs_base__slice_u8 a_prev) { + uint64_t v_filter_distance = 0; + uint64_t v_i = 0; + uint32_t v_fa = 0; + uint32_t v_fb = 0; + uint32_t v_fc = 0; + uint32_t v_pp = 0; + uint32_t v_pa = 0; + uint32_t v_pb = 0; + uint32_t v_pc = 0; + + v_filter_distance = ((uint64_t)(self->private_impl.f_filter_distance)); + v_i = 0; + while ((v_i < ((uint64_t)(a_curr.len))) && (v_i < ((uint64_t)(a_prev.len)))) { + if (v_i < v_filter_distance) { + a_curr.ptr[v_i] += a_prev.ptr[v_i]; + } else { + if (((v_i - v_filter_distance) < ((uint64_t)(a_curr.len))) && ((v_i - v_filter_distance) < ((uint64_t)(a_prev.len)))) { + v_fa = ((uint32_t)(a_curr.ptr[(v_i - v_filter_distance)])); + v_fb = ((uint32_t)(a_prev.ptr[v_i])); + v_fc = ((uint32_t)(a_prev.ptr[(v_i - v_filter_distance)])); + v_pp = ((v_fa + v_fb) - v_fc); + v_pa = (v_pp - v_fa); + if (v_pa >= 2147483648) { + v_pa = (0 - v_pa); + } + v_pb = (v_pp - v_fb); + if (v_pb >= 2147483648) { + v_pb = (0 - v_pb); + } + v_pc = (v_pp - v_fc); + if (v_pc >= 2147483648) { + 
v_pc = (0 - v_pc); + } + if ((v_pa <= v_pb) && (v_pa <= v_pc)) { + a_curr.ptr[v_i] += ((uint8_t)((v_fa & 255))); + } else if (v_pb <= v_pc) { + a_curr.ptr[v_i] += ((uint8_t)((v_fb & 255))); + } else { + a_curr.ptr[v_i] += ((uint8_t)((v_fc & 255))); + } + } + } + v_i += 1; + } + return wuffs_base__make_empty_struct(); +} + // -------- func png.decoder.frame_dirty_rect WUFFS_BASE__MAYBE_STATIC wuffs_base__rect_ie_u32
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs index 64bdc1d..0e9b4cb 100644 --- a/std/png/decode_png.wuffs +++ b/std/png/decode_png.wuffs
@@ -398,7 +398,6 @@ var dst_bytes_per_row : base.u64 var dst_palette : slice base.u8 var tab : table base.u8 - var filter_distance : base.u64[..= 8] var y : base.u32 var dst : slice base.u8 @@ -406,15 +405,6 @@ var curr_row : slice base.u8 var prev_row : slice base.u8 - var i : base.u64 - var fa : base.u32 - var fb : base.u32 - var fc : base.u32 - var pp : base.u32 - var pa : base.u32 - var pb : base.u32 - var pc : base.u32 - // TODO: the dst_pixfmt variable shouldn't be necessary. We should be able // to chain the two calls: "args.dst.pixel_format().bits_per_pixel()". dst_pixfmt = args.dst.pixel_format() @@ -426,7 +416,6 @@ dst_bytes_per_row = (this.width as base.u64) * dst_bytes_per_pixel dst_palette = args.dst.palette_or_else(fallback: this.dst_palette[..]) tab = args.dst.plane(p: 0) - filter_distance = this.filter_distance as base.u64 while y < this.height { assert y < 0xFFFF_FFFF via "a < b: a < c; c <= b"(c: this.height) @@ -449,97 +438,13 @@ if filter == 0 { // No-op. } else if filter == 1 { - i = filter_distance - while i < curr_row.length(), - inv y < 0xFFFF_FFFF, - { - assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: curr_row.length()) - if i >= filter_distance { - if (i - filter_distance) < curr_row.length() { - curr_row[i] ~mod+= curr_row[i - filter_distance] - } - } - i += 1 - } endwhile + this.filter_1!(curr: curr_row) } else if filter == 2 { - i = 0 - while (i < curr_row.length()) and (i < prev_row.length()), - inv y < 0xFFFF_FFFF, - { - assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: curr_row.length()) - curr_row[i] ~mod+= prev_row[i] - i += 1 - } endwhile + this.filter_2!(curr: curr_row, prev: prev_row) } else if filter == 3 { - if y == 0 { - i = filter_distance - while i < curr_row.length(), - inv y < 0xFFFF_FFFF, - { - assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: curr_row.length()) - if i >= filter_distance { - if (i - filter_distance) < curr_row.length() { - curr_row[i] ~mod+= curr_row[i - 
filter_distance] / 2 - } - } - i += 1 - } endwhile - } else { - i = 0 - while (i < curr_row.length()) and (i < prev_row.length()), - inv y < 0xFFFF_FFFF, - { - assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: curr_row.length()) - if i >= filter_distance { - if (i - filter_distance) < curr_row.length() { - curr_row[i] ~mod+= (( - (curr_row[i - filter_distance] as base.u32) + - (prev_row[i] as base.u32)) / 2) as base.u8 - } - } else { - curr_row[i] ~mod+= prev_row[i] / 2 - } - i += 1 - } endwhile - } + this.filter_3!(curr: curr_row, prev: prev_row) } else if filter == 4 { - i = 0 - while (i < curr_row.length()) and (i < prev_row.length()), - inv y < 0xFFFF_FFFF, - { - assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: curr_row.length()) - if i < filter_distance { - curr_row[i] ~mod+= prev_row[i] - } else { - if ((i - filter_distance) < curr_row.length()) and - ((i - filter_distance) < prev_row.length()) { - fa = curr_row[i - filter_distance] as base.u32 - fb = prev_row[i] as base.u32 - fc = prev_row[i - filter_distance] as base.u32 - pp = (fa ~mod+ fb) ~mod- fc - pa = pp ~mod- fa - if pa >= 0x8000_0000 { - pa = 0 ~mod- pa - } - pb = pp ~mod- fb - if pb >= 0x8000_0000 { - pb = 0 ~mod- pb - } - pc = pp ~mod- fc - if pc >= 0x8000_0000 { - pc = 0 ~mod- pc - } - if (pa <= pb) and (pa <= pc) { - curr_row[i] ~mod+= (fa & 0xFF) as base.u8 - } else if pb <= pc { - curr_row[i] ~mod+= (fb & 0xFF) as base.u8 - } else { - curr_row[i] ~mod+= (fc & 0xFF) as base.u8 - } - } - } - i += 1 - } endwhile + this.filter_4!(curr: curr_row, prev: prev_row) } else { return "#bad filter" } @@ -556,6 +461,118 @@ return ok } +pri func decoder.filter_1!(curr: slice base.u8) { + var filter_distance : base.u64[..= 8] + var i : base.u64 + + filter_distance = this.filter_distance as base.u64 + i = filter_distance + while i < args.curr.length() { + assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: args.curr.length()) + if i >= filter_distance { + if (i - 
filter_distance) < args.curr.length() { + args.curr[i] ~mod+= args.curr[i - filter_distance] + } + } + i += 1 + } endwhile +} + +pri func decoder.filter_2!(curr: slice base.u8, prev: slice base.u8) { + var i : base.u64 + + i = 0 + while (i < args.curr.length()) and (i < args.prev.length()) { + assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: args.curr.length()) + args.curr[i] ~mod+= args.prev[i] + i += 1 + } endwhile +} + +pri func decoder.filter_3!(curr: slice base.u8, prev: slice base.u8) { + var filter_distance : base.u64[..= 8] + var i : base.u64 + + filter_distance = this.filter_distance as base.u64 + if args.prev.length() == 0 { + i = filter_distance + while i < args.curr.length() { + assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: args.curr.length()) + if i >= filter_distance { + if (i - filter_distance) < args.curr.length() { + args.curr[i] ~mod+= args.curr[i - filter_distance] / 2 + } + } + i += 1 + } endwhile + } else { + i = 0 + while (i < args.curr.length()) and (i < args.prev.length()) { + assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: args.curr.length()) + if i >= filter_distance { + if (i - filter_distance) < args.curr.length() { + args.curr[i] ~mod+= (( + (args.curr[i - filter_distance] as base.u32) + + (args.prev[i] as base.u32)) / 2) as base.u8 + } + } else { + args.curr[i] ~mod+= args.prev[i] / 2 + } + i += 1 + } endwhile + } +} + +pri func decoder.filter_4!(curr: slice base.u8, prev: slice base.u8) { + var filter_distance : base.u64[..= 8] + var i : base.u64 + + var fa : base.u32 + var fb : base.u32 + var fc : base.u32 + var pp : base.u32 + var pa : base.u32 + var pb : base.u32 + var pc : base.u32 + + filter_distance = this.filter_distance as base.u64 + i = 0 + while (i < args.curr.length()) and (i < args.prev.length()) { + assert i < 0xFFFF_FFFF_FFFF_FFFF via "a < b: a < c; c <= b"(c: args.curr.length()) + if i < filter_distance { + args.curr[i] ~mod+= args.prev[i] + } else { + if ((i - 
filter_distance) < args.curr.length()) and + ((i - filter_distance) < args.prev.length()) { + fa = args.curr[i - filter_distance] as base.u32 + fb = args.prev[i] as base.u32 + fc = args.prev[i - filter_distance] as base.u32 + pp = (fa ~mod+ fb) ~mod- fc + pa = pp ~mod- fa + if pa >= 0x8000_0000 { + pa = 0 ~mod- pa + } + pb = pp ~mod- fb + if pb >= 0x8000_0000 { + pb = 0 ~mod- pb + } + pc = pp ~mod- fc + if pc >= 0x8000_0000 { + pc = 0 ~mod- pc + } + if (pa <= pb) and (pa <= pc) { + args.curr[i] ~mod+= (fa & 0xFF) as base.u8 + } else if pb <= pc { + args.curr[i] ~mod+= (fb & 0xFF) as base.u8 + } else { + args.curr[i] ~mod+= (fc & 0xFF) as base.u8 + } + } + } + i += 1 + } endwhile +} + pub func decoder.frame_dirty_rect() base.rect_ie_u32 { return this.util.make_rect_ie_u32( min_incl_x: 0,