Add etc_copy_u32_from_history_8_byte_chunks_fast
On a mid-range x86_64 laptop:
----
$ ls -lh linux-5.11.3.tar.gz | awk '{print $5 " " $9}'
178M linux-5.11.3.tar.gz
$ # Debian's /usr/bin/zcat
$ time /usr/bin/zcat < linux-5.11.3.tar.gz > /dev/null
real 0m8.423s
$ # Wuffs; before
$ time gen/bin/example-zcat < linux-5.11.3.tar.gz > /dev/null
real 0m4.054s
$ # Wuffs; after
$ time gen/bin/example-zcat < linux-5.11.3.tar.gz > /dev/null
real 0m2.651s
----
Wuffs benchmarks:
wuffs_deflate_decode_1k_full_init/clang9 95.6MB/s ± 0% 98.1MB/s ± 1% +2.62% (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/clang9 109MB/s ± 0% 112MB/s ± 0% +3.31% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/clang9 162MB/s ± 0% 201MB/s ± 0% +24.32% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/clang9 165MB/s ± 0% 206MB/s ± 0% +25.16% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/clang9 170MB/s ± 0% 253MB/s ± 0% +48.63% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/clang9 147MB/s ± 0% 160MB/s ± 0% +8.77% (p=0.016 n=4+5)
wuffs_deflate_decode_1k_full_init/gcc10 92.1MB/s ± 0% 91.7MB/s ± 0% -0.44% (p=0.016 n=5+5)
wuffs_deflate_decode_1k_part_init/gcc10 107MB/s ± 0% 104MB/s ± 0% -3.35% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/gcc10 149MB/s ± 0% 195MB/s ± 0% +30.68% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/gcc10 152MB/s ± 1% 199MB/s ± 1% +31.23% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/gcc10 213MB/s ± 0% 256MB/s ± 0% +20.42% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/gcc10 177MB/s ± 0% 177MB/s ± 1% ~ (p=0.595 n=5+5)
mimic_deflate_decode_1k 128MB/s ± 0% 128MB/s ± 0% +0.42% (p=0.016 n=5+4)
mimic_deflate_decode_10k 143MB/s ± 0% 143MB/s ± 0% ~ (p=0.310 n=5+5)
mimic_deflate_decode_100k_just_one_read 176MB/s ± 0% 176MB/s ± 0% -0.24% (p=0.008 n=5+5)
mimic_deflate_decode_100k_many_big_reads 135MB/s ± 0% 135MB/s ± 0% +0.12% (p=0.016 n=5+5)
----
wuffs_gzip_decode_10k/clang9 166MB/s ± 0% 198MB/s ± 1% +19.66% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9 178MB/s ± 0% 241MB/s ± 0% +35.41% (p=0.008 n=5+5)
wuffs_gzip_decode_10k/gcc10 172MB/s ± 0% 199MB/s ± 0% +15.67% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc10 198MB/s ± 0% 258MB/s ± 0% +29.95% (p=0.008 n=5+5)
mimic_gzip_decode_10k 121MB/s ± 0% 121MB/s ± 0% ~ (p=0.421 n=5+5)
mimic_gzip_decode_100k 144MB/s ± 0% 144MB/s ± 0% ~ (p=0.548 n=5+5)
----
wuffs_zlib_decode_10k/clang9 154MB/s ± 0% 199MB/s ± 0% +28.89% (p=0.016 n=4+5)
wuffs_zlib_decode_100k/clang9 186MB/s ± 1% 234MB/s ± 1% +25.58% (p=0.008 n=5+5)
wuffs_zlib_decode_10k/gcc10 156MB/s ± 0% 202MB/s ± 0% +30.01% (p=0.008 n=5+5)
wuffs_zlib_decode_100k/gcc10 198MB/s ± 0% 262MB/s ± 0% +32.21% (p=0.008 n=5+5)
mimic_zlib_decode_10k 133MB/s ± 0% 133MB/s ± 0% ~ (p=1.000 n=5+5)
mimic_zlib_decode_100k 160MB/s ± 0% 160MB/s ± 0% ~ (p=0.095 n=5+5)
----
wuffs_png_decode_image_19k_8bpp/clang9 115MB/s ± 0% 131MB/s ± 0% +14.45% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9 153MB/s ± 0% 153MB/s ± 0% -0.51% (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9 415MB/s ± 0% 474MB/s ± 0% +14.37% (p=0.016 n=4+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 345MB/s ± 1% 371MB/s ± 0% +7.62% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 335MB/s ± 0% 359MB/s ± 0% +7.06% (p=0.016 n=4+5)
wuffs_png_decode_image_4002k_24bpp/clang9 153MB/s ± 0% 156MB/s ± 0% +2.37% (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/gcc10 123MB/s ± 0% 135MB/s ± 1% +9.38% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10 159MB/s ± 1% 163MB/s ± 0% +2.60% (p=0.016 n=5+4)
wuffs_png_decode_image_77k_8bpp/gcc10 452MB/s ± 0% 479MB/s ± 0% +6.00% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 353MB/s ± 0% 387MB/s ± 0% +9.42% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 341MB/s ± 0% 372MB/s ± 0% +9.07% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10 161MB/s ± 0% 164MB/s ± 0% +1.86% (p=0.016 n=4+5)
mimic_png_decode_image_19k_8bpp 58.1MB/s ± 0% 58.2MB/s ± 0% +0.22% (p=0.008 n=5+5)
mimic_png_decode_image_40k_24bpp 73.2MB/s ± 0% 73.2MB/s ± 0% ~ (p=0.341 n=5+5)
mimic_png_decode_image_77k_8bpp 177MB/s ± 0% 177MB/s ± 0% ~ (p=0.095 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum skipped
mimic_png_decode_image_552k_32bpp_verify_checksum 147MB/s ± 0% 146MB/s ± 0% ~ (p=0.056 n=5+5)
mimic_png_decode_image_4002k_24bpp 104MB/s ± 0% 104MB/s ± 0% ~ (p=0.841 n=4+5)
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index 77e25a9..11b6274 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -204,7 +204,9 @@
// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the
// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has
-// stronger pre-conditions. The caller needs to prove that:
+// stronger pre-conditions.
+//
+// The caller needs to prove that:
// - distance > 0
// - distance <= (*ptr_iop_w - io1_w)
// - length <= (io2_w - *ptr_iop_w)
@@ -229,6 +231,44 @@
return length;
}
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
+// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - distance > 0
+// - distance <= (*ptr_iop_w - io1_w)
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+static inline uint32_t //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+ uint8_t** ptr_iop_w,
+ uint8_t* io1_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint8_t* q = p - distance;
+ uint32_t n = length;
+ while (1) {
+ memcpy(p, q, 8);
+ if (n <= 8) {
+ p += n;
+ break;
+ }
+ p += 8;
+ q += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return length;
+}
+
static inline uint32_t //
wuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,
uint8_t* io2_w,
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index be32c70..cb10c1d 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -307,13 +307,11 @@
}
switch method {
- case t.IDLimitedCopyU32FromHistory, t.IDLimitedCopyU32FromHistoryFast:
- suffix := ""
- if method == t.IDLimitedCopyU32FromHistoryFast {
- suffix = "_fast"
- }
- b.printf("wuffs_base__io_writer__limited_copy_u32_from_history%s(\n&%s%s, %s%s, %s%s",
- suffix, iopPrefix, recvName, io0Prefix, recvName, io2Prefix, recvName)
+ case t.IDLimitedCopyU32FromHistory,
+ t.IDLimitedCopyU32FromHistory8ByteChunksFast,
+ t.IDLimitedCopyU32FromHistoryFast:
+ b.printf("wuffs_base__io_writer__%s(\n&%s%s, %s%s, %s%s",
+ method.Str(g.tm), iopPrefix, recvName, io0Prefix, recvName, io2Prefix, recvName)
for _, o := range args {
b.writes(", ")
if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 535b79a..1c91fcc 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -212,9 +212,11 @@
"" +
"// --------\n\nstatic inline uint64_t //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n wuffs_base__slice_u8 src) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = src.len;\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n if (n > 0) {\n memmove(iop_w, src.ptr, n);\n *ptr_iop_w += n;\n }\n return (uint64_t)(n);\n}\n\nstatic inline void //\nwuffs_base__io_writer__limit(uint8_t** ptr_io2_w,\n uint8_t* iop_w,\n uint64_t limit) {\n if (((uint64_t)(*ptr_io2_w - iop_w)) > limit) {\n *ptr_io2_w = iop_w + limit;\n }\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history(uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n " +
" uint32_t distance) {\n if (!distance) {\n return 0;\n }\n uint8_t* p = *ptr_iop_w;\n if ((size_t)(p - io1_w) < (size_t)(distance)) {\n return 0;\n }\n uint8_t* q = p - distance;\n size_t n = (size_t)(io2_w - p);\n if ((size_t)(length) > n) {\n length = (uint32_t)(n);\n } else {\n n = (size_t)(length);\n }\n // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n // is mostly because 3 is the minimum length for the deflate format. This\n // function implementation shouldn't overfit to that one format. Perhaps the\n // limited_copy_u32_from_history Wuffs method should also take an unroll hint\n // argument, and the cgen can look if that argument is the constant\n // expression '3'.\n //\n // See also wuffs_base__io_writer__limited_copy_u32_from_history_fast below.\n //\n // Alternatively or additionally, have a sloppy_limited_copy_u32_from_history\n // method that copies 8 bytes at a time, which can more than length bytes?\n for (; n >= 3; " +
- "n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = *q++;\n }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions. The caller needs to prove that:\n// - distance > 0\n// - distance <= (*ptr_iop_w - io1_w)\n// - length <= (io2_w - *ptr_iop_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint8_t* q = p - distance;\n uint32_t n = length;\n for (; n >= 3; n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = *q++;\n" +
- " }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n uint32_t length,\n const uint8_t** ptr_iop_r,\n const uint8_t* io2_r) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = length;\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n const uint8_t* iop_r = *ptr_iop_r;\n if (n > ((size_t)(io2_r - iop_r))) {\n n = (size_t)(io2_r - iop_r);\n }\n if (n > 0) {\n memmove(iop_w, iop_r, n);\n *ptr_iop_w += n;\n *ptr_iop_r += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n " +
- " uint32_t length,\n wuffs_base__slice_u8 src) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = src.len;\n if (n > length) {\n n = length;\n }\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n if (n > 0) {\n memmove(iop_w, src.ptr, n);\n *ptr_iop_w += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer* //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n uint8_t** ptr_iop_w,\n uint8_t** ptr_io0_w,\n uint8_t** ptr_io1_w,\n uint8_t** ptr_io2_w,\n wuffs_base__slice_u8 data) {\n b->data = data;\n b->meta.wi = 0;\n b->meta.ri = 0;\n b->meta.pos = 0;\n b->meta.closed = false;\n\n *ptr_iop_w = data.ptr;\n *ptr_io0_w = data.ptr;\n *ptr_io1_w = data.ptr;\n *ptr_io2_w = data.ptr + data.len;\n\n return b;\n}\n\n" +
+ "n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = *q++;\n }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n// - distance > 0\n// - distance <= (*ptr_iop_w - io1_w)\n// - length <= (io2_w - *ptr_iop_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint8_t* q = p - distance;\n uint32_t n = length;\n for (; n >= 3; n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = " +
+ "*q++;\n }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n// - distance > 0\n// - distance <= (*ptr_iop_w - io1_w)\n// - (length + 8) <= (io2_w - *ptr_iop_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint8_t* q = p - distance;\n uint32_t" +
+ " n = length;\n while (1) {\n memcpy(p, q, 8);\n if (n <= 8) {\n p += n;\n break;\n }\n p += 8;\n q += 8;\n n -= 8;\n }\n *ptr_iop_w = p;\n return length;\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n uint32_t length,\n const uint8_t** ptr_iop_r,\n const uint8_t* io2_r) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = length;\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n const uint8_t* iop_r = *ptr_iop_r;\n if (n > ((size_t)(io2_r - iop_r))) {\n n = (size_t)(io2_r - iop_r);\n }\n if (n > 0) {\n memmove(iop_w, iop_r, n);\n *ptr_iop_w += n;\n *ptr_iop_r += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_" +
+ "w,\n uint8_t* io2_w,\n uint32_t length,\n wuffs_base__slice_u8 src) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = src.len;\n if (n > length) {\n n = length;\n }\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n if (n > 0) {\n memmove(iop_w, src.ptr, n);\n *ptr_iop_w += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer* //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n uint8_t** ptr_iop_w,\n uint8_t** ptr_io0_w,\n uint8_t** ptr_io1_w,\n uint8_t** ptr_io2_w,\n wuffs_base__slice_u8 data) {\n b->data = data;\n b->meta.wi = 0;\n b->meta.ri = 0;\n b->meta.pos = 0;\n b->meta.closed = false;\n\n *ptr_iop_w = data.ptr;\n *ptr_io0_w = data.ptr;\n *ptr_io1_w = data.ptr;\n *ptr_io2_w = data.pt" +
+ "r + data.len;\n\n return b;\n}\n\n" +
"" +
"// ---------------- I/O (Utility)\n\n#define wuffs_base__utility__empty_io_reader wuffs_base__empty_io_reader\n#define wuffs_base__utility__empty_io_writer wuffs_base__empty_io_writer\n" +
""
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 375014e..487a4b0 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -538,10 +538,17 @@
// TODO: this should have explicit pre-conditions:
// - up_to <= this.length()
// - distance > 0
- // - distance <= this.since_mark().length()
+ // - distance <= this.history_length()
// For now, that's all implicitly checked (i.e. hard coded).
"io_writer.limited_copy_u32_from_history_fast!(up_to: u32, distance: u32) u32",
+ // TODO: this should have explicit pre-conditions:
+ // - (up_to + 8) <= this.length()
+ // - distance > 0
+ // - distance <= this.history_length()
+ // For now, that's all implicitly checked (i.e. hard coded).
+ "io_writer.limited_copy_u32_from_history_8_byte_chunks_fast!(up_to: u32, distance: u32) u32",
+
// ---- token_writer
"token_writer.write_simple_token_fast!(" +
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index 24df237..2fa3044 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1167,8 +1167,13 @@
return bounds{}, err
}
+ } else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
+ if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight); err != nil {
+ return bounds{}, err
+ }
+
} else if method == t.IDLimitedCopyU32FromHistoryFast {
- if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args()); err != nil {
+ if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil); err != nil {
return bounds{}, err
}
@@ -1281,11 +1286,13 @@
return fmt.Errorf("check: could not prove %s.can_undo_byte()", recv.Str(q.tm))
}
-func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node) error {
+func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int) error {
// As per cgen's io-private.h, there are three pre-conditions:
- // - upTo <= this.length()
+ // - (upTo + adj) <= this.length()
// - distance > 0
// - distance <= this.history_length()
+ //
+ // adj may be nil, in which case (upTo + adj) is just upTo.
if len(args) != 2 {
return fmt.Errorf("check: internal error: inconsistent limited_copy_u32_from_history_fast arguments")
@@ -1301,15 +1308,26 @@
continue
}
- // Check that the LHS is "upTo as base.u64".
+ // Check that the LHS is "(upTo + adj) as base.u64".
lhs := x.LHS().AsExpr()
if lhs.Operator() != t.IDXBinaryAs {
continue
}
llhs, lrhs := lhs.LHS().AsExpr(), lhs.RHS().AsTypeExpr()
- if !llhs.Eq(upTo) || !lrhs.Eq(typeExprU64) {
+ if !lrhs.Eq(typeExprU64) {
continue
}
+ if adj == nil {
+ if !llhs.Eq(upTo) {
+ continue
+ }
+ } else {
+ if (llhs.Operator() != t.IDXBinaryPlus) || !llhs.LHS().AsExpr().Eq(upTo) {
+ continue
+ } else if cv := llhs.RHS().AsExpr().ConstValue(); (cv == nil) || (cv.Cmp(adj) != 0) {
+ continue
+ }
+ }
// Check that the RHS is "recv.length()".
y, method, yArgs := splitReceiverMethodArgs(x.RHS().AsExpr())
@@ -1322,7 +1340,12 @@
break check0
}
- return fmt.Errorf("check: could not prove (up_to as base.u64) <= %s.length()", recv.Str(q.tm))
+ if adj == nil {
+ return fmt.Errorf("check: could not prove (%s as base.u64) <= %s.length()",
+ upTo.Str(q.tm), recv.Str(q.tm))
+ }
+ return fmt.Errorf("check: could not prove ((%s + %v) as base.u64) <= %s.length()",
+ upTo.Str(q.tm), adj, recv.Str(q.tm))
}
// Check "distance > 0".
@@ -1340,7 +1363,7 @@
}
break check1
}
- return fmt.Errorf("check: could not prove distance > 0")
+ return fmt.Errorf("check: could not prove %s > 0", distance.Str(q.tm))
}
// Check "distance <= this.history_length()".
@@ -1372,7 +1395,8 @@
break check2
}
- return fmt.Errorf("check: could not prove distance <= %s.history_length()", recv.Str(q.tm))
+ return fmt.Errorf("check: could not prove %s <= %s.history_length()",
+ distance.Str(q.tm), recv.Str(q.tm))
}
return nil
diff --git a/lang/token/list.go b/lang/token/list.go
index 7cfbde7..797f983 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -504,12 +504,13 @@
IDSkipU32 = ID(0x16B)
IDSkipU32Fast = ID(0x16C)
- IDCopyFromSlice = ID(0x170)
- IDLimitedCopyU32FromHistory = ID(0x171)
- IDLimitedCopyU32FromHistoryFast = ID(0x172)
- IDLimitedCopyU32FromReader = ID(0x173)
- IDLimitedCopyU32FromSlice = ID(0x174)
- IDLimitedCopyU32ToSlice = ID(0x175)
+ IDCopyFromSlice = ID(0x170)
+ IDLimitedCopyU32FromHistory = ID(0x171)
+ IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x172)
+ IDLimitedCopyU32FromHistoryFast = ID(0x173)
+ IDLimitedCopyU32FromReader = ID(0x174)
+ IDLimitedCopyU32FromSlice = ID(0x175)
+ IDLimitedCopyU32ToSlice = ID(0x176)
// -------- 0x180 block.
@@ -927,12 +928,13 @@
IDSkipU32: "skip_u32",
IDSkipU32Fast: "skip_u32_fast",
- IDCopyFromSlice: "copy_from_slice",
- IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
- IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
- IDLimitedCopyU32FromReader: "limited_copy_u32_from_reader",
- IDLimitedCopyU32FromSlice: "limited_copy_u32_from_slice",
- IDLimitedCopyU32ToSlice: "limited_copy_u32_to_slice",
+ IDCopyFromSlice: "copy_from_slice",
+ IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
+ IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
+ IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
+ IDLimitedCopyU32FromReader: "limited_copy_u32_from_reader",
+ IDLimitedCopyU32FromSlice: "limited_copy_u32_from_slice",
+ IDLimitedCopyU32ToSlice: "limited_copy_u32_to_slice",
// -------- 0x180 block.
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b0f7e8a..d7e2bda 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -10055,7 +10055,9 @@
// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the
// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has
-// stronger pre-conditions. The caller needs to prove that:
+// stronger pre-conditions.
+//
+// The caller needs to prove that:
// - distance > 0
// - distance <= (*ptr_iop_w - io1_w)
// - length <= (io2_w - *ptr_iop_w)
@@ -10080,6 +10082,44 @@
return length;
}
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
+// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - distance > 0
+// - distance <= (*ptr_iop_w - io1_w)
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+static inline uint32_t //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+ uint8_t** ptr_iop_w,
+ uint8_t* io1_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint8_t* q = p - distance;
+ uint32_t n = length;
+ while (1) {
+ memcpy(p, q, 8);
+ if (n <= 8) {
+ p += n;
+ break;
+ }
+ p += 8;
+ q += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return length;
+}
+
static inline uint32_t //
wuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,
uint8_t* io2_w,
@@ -24151,7 +24191,7 @@
v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
label__loop__continue:;
- while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
+ while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 266) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63)));
iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
v_n_bits |= 56;
@@ -24250,14 +24290,18 @@
if (v_length == 0) {
goto label__loop__continue;
}
- if ((((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) || (((uint64_t)(v_length)) > ((uint64_t)(io2_a_dst - iop_a_dst)))) {
+ if ((((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) || (((uint64_t)(v_length)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)((v_length + 8))) > ((uint64_t)(io2_a_dst - iop_a_dst)))) {
status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_distance);
goto exit;
}
- } else {
}
- wuffs_base__io_writer__limited_copy_u32_from_history_fast(
- &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ if ((v_dist_minus_1 + 1) >= 8) {
+ wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ } else {
+ wuffs_base__io_writer__limited_copy_u32_from_history_fast(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ }
goto label__0__break;
}
label__0__break:;
diff --git a/std/deflate/decode_huffman_fast64.wuffs b/std/deflate/decode_huffman_fast64.wuffs
index d714b70..9e0e8e2 100644
--- a/std/deflate/decode_huffman_fast64.wuffs
+++ b/std/deflate/decode_huffman_fast64.wuffs
@@ -44,7 +44,7 @@
dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
// Check up front, on each iteration, that we have enough buffer space to
- // both read (8 bytes) and write (258 bytes) as much as we need to. Doing
+ // both read (8 bytes) and write (266 bytes) as much as we need to. Doing
// this check once (per iteration), up front, removes the need to check
// multiple times inside the loop body, so it's faster overall.
//
@@ -52,12 +52,18 @@
// 258 is the maximum length in a length-distance pair, as specified in the
// RFC section 3.2.5. Compressed blocks (length and distance codes).
//
+ // We adjust 258 up to (258 + 8) = 266 since it can be faster to overshoot
+ // a little and make multiple-of-8-byte copies even when the length in the
+ // length-distance pair isn't an exact multiple-of-8. Strictly speaking,
+ // 264 (an exact multiple-of-8) is the tight bound but (258 + 8) is easier
+ // for the Wuffs proof system, as length's type is refined to [..= 258],
+ //
// For reading, strictly speaking, we only need 6 bytes (48 bits) of
// available input, because the H-L Literal/Length code is up to 15 bits
// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
// read 64 bits than 48 bits or (48 - n_bits) bits.
- while.loop(args.dst.length() >= 258) and (args.src.length() >= 8) {
+ while.loop(args.dst.length() >= 266) and (args.src.length() >= 8) {
// Ensure that we have at least 56 bits of input.
//
// This is "Variant 4" of
@@ -178,8 +184,12 @@
// its presence minimizes the diff between decode_huffman_fastxx and
// decode_huffman_slow.
while true,
- pre args.dst.length() >= 258,
+ pre args.dst.length() >= 266,
{
+ // We can therefore prove:
+ assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+ assert ((length + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+
// Copy from this.history.
if ((dist_minus_1 + 1) as base.u64) > args.dst.history_length() {
// Set (hlen, hdist) to be the length-distance pair to copy
@@ -217,23 +227,33 @@
}
if (((dist_minus_1 + 1) as base.u64) > args.dst.history_length()) or
- ((length as base.u64) > args.dst.length()) {
+ ((length as base.u64) > args.dst.length()) or
+ (((length + 8) as base.u64) > args.dst.length()) {
return "#internal error: inconsistent distance"
}
- } else {
- assert (length as base.u64) <= 258
- assert args.dst.length() >= 258
- // We can therefore prove:
- assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 258)
}
// Once again, redundant but explicit assertions.
assert (dist_minus_1 + 1) > 0
assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length()
assert (length as base.u64) <= args.dst.length()
+ assert ((length + 8) as base.u64) <= args.dst.length()
// Copy from args.dst.
- args.dst.limited_copy_u32_from_history_fast!(
- up_to: length, distance: (dist_minus_1 + 1))
+ //
+ // For short distances, less than 8 bytes, copying atomic 8-byte
+ // chunks can result in incorrect output, so we fall back to a
+ // slower 1-byte-at-a-time copy. Deflate's copy-from-history can
+ // pick up freshly written bytes. A length = 5, distance = 2 copy
+ // starting with "abc" should give "abcbcbcb" but the 8-byte chunk
+ // technique would give "abcbc???", the exact output depending on
+ // what was previously in the writer buffer.
+ if (dist_minus_1 + 1) >= 8 {
+ args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
+ up_to: length, distance: (dist_minus_1 + 1))
+ } else {
+ args.dst.limited_copy_u32_from_history_fast!(
+ up_to: length, distance: (dist_minus_1 + 1))
+ }
break
} endwhile
} endwhile.loop