| // Copyright 2018 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // ---------------- |
| |
| // This file contains a hand-written C benchmark of different strategies for |
| // decoding PNG data. |
| // |
| // For a PNG image with width W and height H, the H rows can be decompressed |
| // one-at-a-time or all-at-once. Roughly speaking, this corresponds to H versus |
| // 1 call into the zlib decoder. The former (call it "fragmented dst") requires |
| // less scratch-space memory than the latter ("full dst"): 2 * bytes_per_row |
| // instead of H * bytes_per row, but the latter can be faster. |
| // |
| // The zlib-compressed data can be split into multiple IDAT chunks. Similarly, |
| // these chunks can be decompressed separately ("fragmented IDAT") or together |
| // ("full IDAT"), again providing a memory vs speed trade-off. |
| // |
| // This program reports the speed of combining the independent frag/full dst |
| // and frag/full IDAT techniques. |
| // |
| // For example, with gcc 7.3 (and -O3) as of January 2019: |
| // |
| // On ../test/data/hat.png (90 × 112 pixels): |
| // name time/op relative |
| // FragDstFragIDAT/gcc 289µs ± 1% 1.00x |
| // FragDstFullIDAT/gcc 288µs ± 0% 1.00x |
| // FullDstFragIDAT/gcc 149µs ± 1% 1.93x |
| // FullDstFullIDAT/gcc 148µs ± 1% 1.95x |
| // |
| // On ../test/data/hibiscus.regular.png (312 × 442 pixels): |
| // name time/op relative |
| // FragDstFragIDAT/gcc 2.49ms ± 0% 1.00x |
| // FragDstFullIDAT/gcc 2.49ms ± 0% 1.00x |
| // FullDstFragIDAT/gcc 2.08ms ± 0% 1.20x |
| // FullDstFullIDAT/gcc 2.02ms ± 1% 1.23x |
| // |
| // On ../test/data/harvesters.png (1165 × 859 pixels): |
| // name time/op relative |
| // FragDstFragIDAT/gcc 15.6ms ± 2% 1.00x |
| // FragDstFullIDAT/gcc 15.4ms ± 0% 1.01x |
| // FullDstFragIDAT/gcc 14.4ms ± 0% 1.08x |
| // FullDstFullIDAT/gcc 14.1ms ± 0% 1.10x |
| |
| #include <errno.h> |
| #include <inttypes.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <sys/time.h> |
| #include <unistd.h> |
| |
| // Wuffs ships as a "single file C library" or "header file library" as per |
| // https://github.com/nothings/stb/blob/master/docs/stb_howto.txt |
| // |
| // To use that single file as a "foo.c"-like implementation, instead of a |
| // "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or |
| // compiling it. |
| #define WUFFS_IMPLEMENTATION |
| |
| // Defining the WUFFS_CONFIG__STATIC_FUNCTIONS macro is optional, but when |
| // combined with WUFFS_IMPLEMENTATION, it demonstrates making all of Wuffs' |
| // functions have static storage. |
| // |
| // This can help the compiler ignore or discard unused code, which can produce |
| // faster compiles and smaller binaries. Other motivations are discussed in the |
| // "ALLOW STATIC IMPLEMENTATION" section of |
| // https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt |
| #define WUFFS_CONFIG__STATIC_FUNCTIONS |
| |
| // If building this program in an environment that doesn't easily accommodate |
| // relative includes, you can use the script/inline-c-relative-includes.go |
| // program to generate a stand-alone C file. |
| #include "../release/c/wuffs-unsupported-snapshot.c" |
| |
| // The order matters here. Clang also defines "__GNUC__". |
| #if defined(__clang__) |
| const char* g_cc = "clang"; |
| const char* g_cc_version = __clang_version__; |
| #elif defined(__GNUC__) |
| const char* g_cc = "gcc"; |
| const char* g_cc_version = __VERSION__; |
| #elif defined(_MSC_VER) |
| const char* g_cc = "cl"; |
| const char* g_cc_version = "???"; |
| #else |
| const char* g_cc = "cc"; |
| const char* g_cc_version = "???"; |
| #endif |
| |
| static inline uint32_t // |
| load_u32be(uint8_t* p) { |
| return ((uint32_t)(p[0]) << 24) | ((uint32_t)(p[1]) << 16) | |
| ((uint32_t)(p[2]) << 8) | ((uint32_t)(p[3]) << 0); |
| } |
| |
| // Limit the input PNG image (and therefore its IDAT data) to (64 MiB - 1 byte) |
| // compressed, in up to 1024 IDAT chunks, and 256 MiB and 16384 × 16384 pixels |
| // uncompressed. This is a limitation of this program (which uses the Wuffs |
| // standard library), not a limitation of Wuffs per se. |
| #define DST_BUFFER_ARRAY_SIZE (256 * 1024 * 1024) |
| #define SRC_BUFFER_ARRAY_SIZE (64 * 1024 * 1024) |
| #define MAX_DIMENSION (16384) |
| #define MAX_IDAT_CHUNKS (1024) |
| |
| uint8_t g_dst_buffer_array[DST_BUFFER_ARRAY_SIZE] = {0}; |
| size_t g_dst_len = 0; |
| uint8_t g_src_buffer_array[SRC_BUFFER_ARRAY_SIZE] = {0}; |
| size_t g_src_len = 0; |
| uint8_t g_idat_buffer_array[SRC_BUFFER_ARRAY_SIZE] = {0}; |
| // The n'th IDAT chunk data (where n is a zero-based count) is in |
| // g_idat_buffer_array[i:j], where i = g_idat_splits[n+0] and j = |
| // g_idat_splits[n+1]. |
| size_t g_idat_splits[MAX_IDAT_CHUNKS + 1] = {0}; |
| uint32_t g_num_idat_chunks = 0; |
| |
| #define WORK_BUFFER_ARRAY_SIZE \ |
| WUFFS_ZLIB__DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE |
| #if WORK_BUFFER_ARRAY_SIZE > 0 |
| uint8_t g_work_buffer_array[WORK_BUFFER_ARRAY_SIZE]; |
| #else |
| // Not all C/C++ compilers support 0-length arrays. |
| uint8_t g_work_buffer_array[1]; |
| #endif |
| |
| uint32_t g_width = 0; |
| uint32_t g_height = 0; |
| uint64_t g_bytes_per_pixel = 0; |
| uint64_t g_bytes_per_row = 0; |
| uint64_t g_bytes_per_frame = 0; |
| |
| const char* // |
| read_stdin() { |
| while (g_src_len < SRC_BUFFER_ARRAY_SIZE) { |
| const int stdin_fd = 0; |
| ssize_t n = read(stdin_fd, g_src_buffer_array + g_src_len, |
| SRC_BUFFER_ARRAY_SIZE - g_src_len); |
| if (n > 0) { |
| g_src_len += n; |
| } else if (n == 0) { |
| return NULL; |
| } else if (errno == EINTR) { |
| // No-op. |
| } else { |
| return strerror(errno); |
| } |
| } |
| return "input is too large"; |
| } |
| |
| const char* // |
| process_png_chunks(uint8_t* p, size_t n) { |
| while (n > 0) { |
| // Process the 8 byte chunk header. |
| if (n < 8) { |
| return "invalid PNG chunk"; |
| } |
| uint32_t chunk_len = load_u32be(p + 0); |
| uint32_t chunk_type = load_u32be(p + 4); |
| p += 8; |
| n -= 8; |
| |
| // Process the chunk payload. |
| if (n < chunk_len) { |
| return "short PNG chunk data"; |
| } |
| switch (chunk_type) { |
| case 0x49484452: // "IHDR" |
| if (chunk_len != 13) { |
| return "invalid PNG IDAT chunk"; |
| } |
| g_width = load_u32be(p + 0); |
| g_height = load_u32be(p + 4); |
| if ((g_width == 0) || (g_height == 0)) { |
| return "image dimensions are too small"; |
| } |
| if ((g_width > MAX_DIMENSION) || (g_height > MAX_DIMENSION)) { |
| return "image dimensions are too large"; |
| } |
| if (p[8] != 8) { |
| return "unsupported PNG bit depth"; |
| } |
| if (g_bytes_per_pixel != 0) { |
| return "duplicate PNG IHDR chunk"; |
| } |
| // Process the color type, as per the PNG spec table 11.1. |
| switch (p[9]) { |
| case 0: |
| g_bytes_per_pixel = 1; |
| break; |
| case 2: |
| g_bytes_per_pixel = 3; |
| break; |
| case 3: |
| g_bytes_per_pixel = 1; |
| break; |
| case 4: |
| g_bytes_per_pixel = 2; |
| break; |
| case 6: |
| g_bytes_per_pixel = 4; |
| break; |
| default: |
| return "unsupported PNG color type"; |
| } |
| if (p[12] != 0) { |
| return "unsupported PNG interlacing"; |
| } |
| break; |
| |
| case 0x49444154: // "IDAT" |
| if (g_num_idat_chunks == MAX_IDAT_CHUNKS - 1) { |
| return "too many IDAT chunks"; |
| } |
| memcpy(g_idat_buffer_array + g_idat_splits[g_num_idat_chunks], p, |
| chunk_len); |
| g_idat_splits[g_num_idat_chunks + 1] = |
| g_idat_splits[g_num_idat_chunks] + chunk_len; |
| g_num_idat_chunks++; |
| break; |
| } |
| p += chunk_len; |
| n -= chunk_len; |
| |
| // Process (and ignore) the 4 byte chunk footer (a checksum). |
| if (n < 4) { |
| return "invalid PNG chunk"; |
| } |
| p += 4; |
| n -= 4; |
| } |
| return NULL; |
| } |
| |
| const char* // |
| decode_once(bool frag_dst, bool frag_idat) { |
| wuffs_zlib__decoder dec; |
| wuffs_base__status status = |
| wuffs_zlib__decoder__initialize(&dec, sizeof dec, WUFFS_VERSION, 0); |
| if (!wuffs_base__status__is_ok(&status)) { |
| return wuffs_base__status__message(&status); |
| } |
| |
| wuffs_base__io_buffer dst = ((wuffs_base__io_buffer){ |
| .data = ((wuffs_base__slice_u8){ |
| .ptr = g_dst_buffer_array, |
| .len = g_bytes_per_frame, |
| }), |
| }); |
| wuffs_base__io_buffer idat = ((wuffs_base__io_buffer){ |
| .data = ((wuffs_base__slice_u8){ |
| .ptr = g_idat_buffer_array, |
| .len = SRC_BUFFER_ARRAY_SIZE, |
| }), |
| .meta = ((wuffs_base__io_buffer_meta){ |
| .wi = g_idat_splits[g_num_idat_chunks], |
| .ri = 0, |
| .pos = 0, |
| .closed = true, |
| }), |
| }); |
| |
| uint32_t i = 0; // Number of dst fragments processed, if frag_dst. |
| if (frag_dst) { |
| dst.data.len = g_bytes_per_row; |
| } |
| |
| uint32_t j = 0; // Number of IDAT fragments processed, if frag_idat. |
| if (frag_idat) { |
| idat.meta.wi = g_idat_splits[1]; |
| idat.meta.closed = (g_num_idat_chunks == 1); |
| } |
| |
| while (true) { |
| status = |
| wuffs_zlib__decoder__transform_io(&dec, &dst, &idat, |
| ((wuffs_base__slice_u8){ |
| .ptr = g_work_buffer_array, |
| .len = WORK_BUFFER_ARRAY_SIZE, |
| })); |
| |
| if (wuffs_base__status__is_ok(&status)) { |
| break; |
| } |
| if ((status.repr == wuffs_base__suspension__short_write) && frag_dst && |
| (i < g_height - 1)) { |
| i++; |
| dst.data.len = g_bytes_per_row * (i + 1); |
| continue; |
| } |
| if ((status.repr == wuffs_base__suspension__short_read) && frag_idat && |
| (j < g_num_idat_chunks - 1)) { |
| j++; |
| idat.meta.wi = g_idat_splits[j + 1]; |
| idat.meta.closed = (g_num_idat_chunks == j + 1); |
| continue; |
| } |
| return wuffs_base__status__message(&status); |
| } |
| |
| if (dst.meta.wi != g_bytes_per_frame) { |
| return "unexpected number of bytes decoded"; |
| } |
| return NULL; |
| } |
| |
| const char* // |
| decode(bool frag_dst, bool frag_idat) { |
| int reps; |
| if (g_bytes_per_frame < 100000) { |
| reps = 1000; |
| } else if (g_bytes_per_frame < 1000000) { |
| reps = 100; |
| } else if (g_bytes_per_frame < 10000000) { |
| reps = 10; |
| } else { |
| reps = 1; |
| } |
| |
| struct timeval bench_start_tv; |
| gettimeofday(&bench_start_tv, NULL); |
| |
| for (int i = 0; i < reps; i++) { |
| const char* msg = decode_once(frag_dst, frag_idat); |
| if (msg) { |
| return msg; |
| } |
| } |
| |
| struct timeval bench_finish_tv; |
| gettimeofday(&bench_finish_tv, NULL); |
| int64_t micros = |
| (int64_t)(bench_finish_tv.tv_sec - bench_start_tv.tv_sec) * 1000000 + |
| (int64_t)(bench_finish_tv.tv_usec - bench_start_tv.tv_usec); |
| uint64_t nanos = 1; |
| if (micros > 0) { |
| nanos = (uint64_t)(micros)*1000; |
| } |
| |
| printf("Benchmark%sDst%sIDAT/%s\t%8d\t%8" PRIu64 " ns/op\n", |
| frag_dst ? "Frag" : "Full", // |
| frag_idat ? "Frag" : "Full", // |
| g_cc, reps, nanos / reps); |
| |
| return NULL; |
| } |
| |
| int // |
| fail(const char* msg) { |
| const int stderr_fd = 2; |
| write(stderr_fd, msg, strnlen(msg, 4095)); |
| write(stderr_fd, "\n", 1); |
| return 1; |
| } |
| |
| int // |
| main(int argc, char** argv) { |
| const char* msg = read_stdin(); |
| if (msg) { |
| return fail(msg); |
| } |
| if ((g_src_len < 8) || strncmp((const char*)(g_src_buffer_array), |
| "\x89PNG\x0D\x0A\x1A\x0A", 8)) { |
| return fail("invalid PNG"); |
| } |
| msg = process_png_chunks(g_src_buffer_array + 8, g_src_len - 8); |
| if (msg) { |
| return fail(msg); |
| } |
| if (g_bytes_per_pixel == 0) { |
| return fail("missing PNG IHDR chunk"); |
| } |
| if (g_num_idat_chunks == 0) { |
| return fail("missing PNG IDAT chunk"); |
| } |
| // The +1 here is for the per-row filter byte. |
| g_bytes_per_row = (uint64_t)g_width * g_bytes_per_pixel + 1; |
| g_bytes_per_frame = (uint64_t)g_height * g_bytes_per_row; |
| if (g_bytes_per_frame > DST_BUFFER_ARRAY_SIZE) { |
| return fail("decompressed data is too large"); |
| } |
| |
| printf("# %s version %s\n#\n", g_cc, g_cc_version); |
| printf( |
| "# The output format, including the \"Benchmark\" prefixes, is " |
| "compatible with the\n" |
| "# https://godoc.org/golang.org/x/perf/cmd/benchstat tool. To install " |
| "it, first\n" |
| "# install Go, then run \"go get golang.org/x/perf/cmd/benchstat\".\n"); |
| |
| for (int i = 0; i < 5; i++) { |
| msg = decode(true, true); |
| if (msg) { |
| return fail(msg); |
| } |
| msg = decode(true, false); |
| if (msg) { |
| return fail(msg); |
| } |
| msg = decode(false, true); |
| if (msg) { |
| return fail(msg); |
| } |
| msg = decode(false, false); |
| if (msg) { |
| return fail(msg); |
| } |
| } |
| |
| return 0; |
| } |