Optimize std/deflate decoding for RLE sections
name old speed new speed delta
wuffs_deflate_decode_1k_full_init/clang9 104MB/s ± 1% 104MB/s ± 1% ~ (p=0.161 n=9+9)
wuffs_deflate_decode_1k_part_init/clang9 121MB/s ± 1% 119MB/s ± 1% -0.92% (p=0.000 n=9+9)
wuffs_deflate_decode_10k_full_init/clang9 223MB/s ± 1% 223MB/s ± 1% ~ (p=0.796 n=9+9)
wuffs_deflate_decode_10k_part_init/clang9 230MB/s ± 1% 228MB/s ± 2% ~ (p=0.065 n=10+9)
wuffs_deflate_decode_100k_just_one_read/clang9 279MB/s ± 1% 292MB/s ± 0% +4.78% (p=0.000 n=9+8)
wuffs_deflate_decode_100k_many_big_reads/clang9 178MB/s ± 5% 189MB/s ± 2% +6.41% (p=0.000 n=10+10)
wuffs_deflate_decode_1k_full_init/gcc10 103MB/s ± 1% 102MB/s ± 1% -0.57% (p=0.022 n=10+9)
wuffs_deflate_decode_1k_part_init/gcc10 122MB/s ± 1% 121MB/s ± 3% -1.46% (p=0.000 n=10+9)
wuffs_deflate_decode_10k_full_init/gcc10 235MB/s ± 1% 231MB/s ± 1% -1.34% (p=0.000 n=10+8)
wuffs_deflate_decode_10k_part_init/gcc10 242MB/s ± 1% 239MB/s ± 0% -1.40% (p=0.001 n=9+8)
wuffs_deflate_decode_100k_just_one_read/gcc10 300MB/s ± 4% 297MB/s ± 2% ~ (p=0.156 n=10+9)
wuffs_deflate_decode_100k_many_big_reads/gcc10 193MB/s ± 2% 197MB/s ± 1% +2.29% (p=0.000 n=10+9)
wuffs_gzip_decode_10k/clang9 220MB/s ± 1% 219MB/s ± 1% ~ (p=0.605 n=9+9)
wuffs_gzip_decode_100k/clang9 269MB/s ± 2% 283MB/s ± 2% +4.98% (p=0.000 n=9+10)
wuffs_gzip_decode_10k/gcc10 227MB/s ± 1% 224MB/s ± 1% -1.42% (p=0.000 n=10+9)
wuffs_gzip_decode_100k/gcc10 283MB/s ± 1% 285MB/s ± 4% ~ (p=0.182 n=9+10)
wuffs_zlib_decode_10k/clang9 226MB/s ± 1% 220MB/s ± 4% -2.66% (p=0.000 n=9+10)
wuffs_zlib_decode_100k/clang9 272MB/s ± 2% 285MB/s ± 1% +4.77% (p=0.000 n=10+8)
wuffs_zlib_decode_10k/gcc10 227MB/s ± 1% 212MB/s ± 1% -6.80% (p=0.000 n=10+9)
wuffs_zlib_decode_100k/gcc10 282MB/s ± 1% 273MB/s ± 0% -3.32% (p=0.000 n=9+8)
wuffs_png_decode_image_19k_8bpp/clang9 148MB/s ± 1% 138MB/s ± 2% -6.79% (p=0.000 n=9+10)
wuffs_png_decode_image_40k_24bpp/clang9 170MB/s ± 1% 171MB/s ± 2% ~ (p=0.133 n=9+10)
wuffs_png_decode_image_77k_8bpp/clang9 525MB/s ± 1% 495MB/s ± 4% -5.81% (p=0.000 n=8+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 400MB/s ± 2% 457MB/s ± 4% +14.16% (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 388MB/s ± 1% 435MB/s ± 5% +12.17% (p=0.000 n=9+10)
wuffs_png_decode_image_4002k_24bpp/clang9 175MB/s ± 1% 172MB/s ± 2% -1.78% (p=0.005 n=10+10)
wuffs_png_decode_image_19k_8bpp/gcc10 154MB/s ± 3% 157MB/s ± 1% +1.72% (p=0.001 n=9+9)
wuffs_png_decode_image_40k_24bpp/gcc10 182MB/s ± 1% 187MB/s ± 1% +2.56% (p=0.000 n=9+10)
wuffs_png_decode_image_77k_8bpp/gcc10 552MB/s ± 3% 554MB/s ± 1% ~ (p=0.863 n=9+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 426MB/s ± 2% 502MB/s ± 1% +17.85% (p=0.000 n=10+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 410MB/s ± 2% 481MB/s ± 1% +17.17% (p=0.000 n=9+8)
wuffs_png_decode_image_4002k_24bpp/gcc10 186MB/s ± 1% 189MB/s ± 2% +1.33% (p=0.001 n=9+10)
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index bc180d0..c275fee 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -228,6 +228,46 @@
return length;
}
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+// - distance == 1
+// - distance <= (*ptr_iop_w - io1_w)
+static inline uint32_t //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+ uint8_t** ptr_iop_w,
+ uint8_t* io1_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint64_t x = p[-1];
+ x |= x << 8;
+ x |= x << 16;
+ x |= x << 32;
+ uint32_t n = length;
+ while (1) {
+ wuffs_base__poke_u64le__no_bounds_check(p, x);
+ if (n <= 8) {
+ p += n;
+ break;
+ }
+ p += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return length;
+}
+
// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
// above, but copies 8 byte chunks at a time.
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index c1c4300..b0e4d1b 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -308,6 +308,7 @@
switch method {
case t.IDLimitedCopyU32FromHistory,
+ t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast,
t.IDLimitedCopyU32FromHistory8ByteChunksFast,
t.IDLimitedCopyU32FromHistoryFast:
b.printf("wuffs_base__io_writer__%s(\n&%s%s, %s%s, %s%s",
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 8c03c4c..ada5cde 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -223,10 +223,12 @@
"" +
"// --------\n\nstatic inline uint64_t //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n wuffs_base__slice_u8 src) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = src.len;\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n if (n > 0) {\n memmove(iop_w, src.ptr, n);\n *ptr_iop_w += n;\n }\n return (uint64_t)(n);\n}\n\nstatic inline void //\nwuffs_base__io_writer__limit(uint8_t** ptr_io2_w,\n uint8_t* iop_w,\n uint64_t limit) {\n if (((uint64_t)(*ptr_io2_w - iop_w)) > limit) {\n *ptr_io2_w = iop_w + limit;\n }\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history(uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n " +
" uint32_t distance) {\n if (!distance) {\n return 0;\n }\n uint8_t* p = *ptr_iop_w;\n if ((size_t)(p - io1_w) < (size_t)(distance)) {\n return 0;\n }\n uint8_t* q = p - distance;\n size_t n = (size_t)(io2_w - p);\n if ((size_t)(length) > n) {\n length = (uint32_t)(n);\n } else {\n n = (size_t)(length);\n }\n // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n // is mostly because 3 is the minimum length for the deflate format. This\n // function implementation shouldn't overfit to that one format. Perhaps the\n // limited_copy_u32_from_history Wuffs method should also take an unroll hint\n // argument, and the cgen can look if that argument is the constant\n // expression '3'.\n //\n // See also wuffs_base__io_writer__limited_copy_u32_from_history_fast below.\n for (; n >= 3; n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = *q++;\n }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_w" +
- "riter__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n// - length <= (io2_w - *ptr_iop_w)\n// - distance >= 1\n// - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint8_t* q = p - distance;\n uint32_t n = length;\n for (; n >= 3; n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = *q++;\n }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast" +
- " is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n// - (length + 8) <= (io2_w - *ptr_iop_w)\n// - distance >= 8\n// - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint8_t* q = p - distance;\n uint32_t n = length;\n while (1) {\n memcpy(p, q, 8);\n if (n <= 8) {\n p += n;\n break;\n }\n p += 8;\n q += 8;\n n -= 8;\n }\n *ptr_iop_w = p;\n ret" +
- "urn length;\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n uint32_t length,\n const uint8_t** ptr_iop_r,\n const uint8_t* io2_r) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = length;\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n const uint8_t* iop_r = *ptr_iop_r;\n if (n > ((size_t)(io2_r - iop_r))) {\n n = (size_t)(io2_r - iop_r);\n }\n if (n > 0) {\n memmove(iop_w, iop_r, n);\n *ptr_iop_w += n;\n *ptr_iop_r += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n uint32_t length,\n " +
- " wuffs_base__slice_u8 src) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = src.len;\n if (n > length) {\n n = length;\n }\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n if (n > 0) {\n memmove(iop_w, src.ptr, n);\n *ptr_iop_w += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer* //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n uint8_t** ptr_iop_w,\n uint8_t** ptr_io0_w,\n uint8_t** ptr_io1_w,\n uint8_t** ptr_io2_w,\n wuffs_base__slice_u8 data) {\n b->data = data;\n b->meta.wi = 0;\n b->meta.ri = 0;\n b->meta.pos = 0;\n b->meta.closed = false;\n\n *ptr_iop_w = data.ptr;\n *ptr_io0_w = data.ptr;\n *ptr_io1_w = data.ptr;\n *ptr_io2_w = data.ptr + data.len;\n\n return b;\n}\n\n" +
+ "riter__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n// - length <= (io2_w - *ptr_iop_w)\n// - distance >= 1\n// - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint8_t* q = p - distance;\n uint32_t n = length;\n for (; n >= 3; n -= 3) {\n *p++ = *q++;\n *p++ = *q++;\n *p++ = *q++;\n }\n for (; n; n--) {\n *p++ = *q++;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_dist" +
+ "ance_1_fast\n// copies the previous byte (the one immediately before *ptr_iop_w), copying 8\n// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n// - (length + 8) <= (io2_w - *ptr_iop_w)\n// - distance == 1\n// - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(\n uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n uint64_t x = p[-1];\n x |= x << 8;\n x |= x << 16;\n x |= x << 32;\n uint32_t n = length;\n while (1) {\n wuffs_base__poke_u64le__no_bounds_check(" +
+ "p, x);\n if (n <= 8) {\n p += n;\n break;\n }\n p += 8;\n n -= 8;\n }\n *ptr_iop_w = p;\n return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n// - (length + 8) <= (io2_w - *ptr_iop_w)\n// - distance >= 8\n// - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n uint8_t** ptr_iop_w,\n uint8_t* io1_w,\n uint8_t* io2_w,\n uint32_t length,\n uint32_t distance) {\n uint8_t* p = *ptr_iop_w;\n u" +
+ "int8_t* q = p - distance;\n uint32_t n = length;\n while (1) {\n memcpy(p, q, 8);\n if (n <= 8) {\n p += n;\n break;\n }\n p += 8;\n q += 8;\n n -= 8;\n }\n *ptr_iop_w = p;\n return length;\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n uint32_t length,\n const uint8_t** ptr_iop_r,\n const uint8_t* io2_r) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = length;\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n const uint8_t* iop_r = *ptr_iop_r;\n if (n > ((size_t)(io2_r - iop_r))) {\n n = (size_t)(io2_r - iop_r);\n }\n if (n > 0) {\n memmove(iop_w, iop_r, n);\n *ptr_iop_w += n;\n *ptr_iop_r += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline uint32_t //\nwuffs_base__io_writer__limited_co" +
+ "py_u32_from_slice(uint8_t** ptr_iop_w,\n uint8_t* io2_w,\n uint32_t length,\n wuffs_base__slice_u8 src) {\n uint8_t* iop_w = *ptr_iop_w;\n size_t n = src.len;\n if (n > length) {\n n = length;\n }\n if (n > ((size_t)(io2_w - iop_w))) {\n n = (size_t)(io2_w - iop_w);\n }\n if (n > 0) {\n memmove(iop_w, src.ptr, n);\n *ptr_iop_w += n;\n }\n return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer* //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n uint8_t** ptr_iop_w,\n uint8_t** ptr_io0_w,\n uint8_t** ptr_io1_w,\n uint8_t** ptr_io2_w,\n wuffs_base__slice_u8 data) {\n b->data = data;\n b->meta.wi = 0;\n b->meta.ri = 0;\n b->meta.pos = 0;\n b->meta.closed = false;\n\n *ptr_iop_w = data.ptr;\n *ptr_io0_w = data.ptr;\n *ptr_io1_" +
+ "w = data.ptr;\n *ptr_io2_w = data.ptr + data.len;\n\n return b;\n}\n\n" +
"" +
"// ---------------- I/O (Utility)\n\n#define wuffs_base__utility__empty_io_reader wuffs_base__empty_io_reader\n#define wuffs_base__utility__empty_io_writer wuffs_base__empty_io_writer\n" +
""
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 4bc325b..dc4efdb 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -549,6 +549,13 @@
// For now, that's all implicitly checked (i.e. hard coded).
"io_writer.limited_copy_u32_from_history_8_byte_chunks_fast!(up_to: u32, distance: u32) u32",
+ // TODO: this should have explicit pre-conditions:
+ // - (up_to + 8) <= this.length()
+ // - distance == 1
+ // - distance <= this.history_length()
+ // For now, that's all implicitly checked (i.e. hard coded).
+ "io_writer.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(up_to: u32, distance: u32) u32",
+
// ---- token_writer
"token_writer.write_simple_token_fast!(" +
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index d718e4d..390f33b 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1160,13 +1160,18 @@
return bounds{}, err
}
+ } else if method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast {
+ if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, nil, one); err != nil {
+ return bounds{}, err
+ }
+
} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
- if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight); err != nil {
+ if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight, nil); err != nil {
return bounds{}, err
}
} else if method == t.IDLimitedCopyU32FromHistoryFast {
- if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil, one); err != nil {
+ if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil, one, nil); err != nil {
return bounds{}, err
}
@@ -1272,10 +1277,11 @@
return fmt.Errorf("check: could not prove %s.can_undo_byte()", recv.Str(q.tm))
}
-func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int, minDistance *big.Int) error {
+func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int, minDistance *big.Int, exactDistance *big.Int) error {
// As per cgen's io-private.h, there are three pre-conditions:
// - (upTo + adj) <= this.length()
- // - distance >= minDistance
+ // - either (distance >= minDistance) or (distance == exactDistance),
+ // depending on which of minDistance and exactDistance is non-nil.
// - distance <= this.history_length()
//
// adj may be nil, in which case (upTo + adj) is just upTo.
@@ -1335,8 +1341,8 @@
}
// Check "distance >= minDistance".
-check1:
- for {
+check1a:
+ for minDistance != nil {
for _, x := range q.facts {
if x.Operator() != t.IDXBinaryGreaterEq {
continue
@@ -1347,11 +1353,29 @@
if rcv := x.RHS().AsExpr().ConstValue(); (rcv == nil) || (rcv.Cmp(minDistance) < 0) {
continue
}
- break check1
+ break check1a
}
return fmt.Errorf("check: could not prove %s >= %v", distance.Str(q.tm), minDistance)
}
+ // Check "distance == exactDistance".
+check1b:
+ for exactDistance != nil {
+ for _, x := range q.facts {
+ if x.Operator() != t.IDXBinaryEqEq {
+ continue
+ }
+ if lhs := x.LHS().AsExpr(); !lhs.Eq(distance) {
+ continue
+ }
+ if rcv := x.RHS().AsExpr().ConstValue(); (rcv == nil) || (rcv.Cmp(exactDistance) != 0) {
+ continue
+ }
+ break check1b
+ }
+ return fmt.Errorf("check: could not prove %s == %v", distance.Str(q.tm), exactDistance)
+ }
+
// Check "distance <= this.history_length()".
check2:
for {
diff --git a/lang/token/list.go b/lang/token/list.go
index c046b42..b8f33e3 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -504,13 +504,14 @@
IDSkipU32 = ID(0x16B)
IDSkipU32Fast = ID(0x16C)
- IDCopyFromSlice = ID(0x170)
- IDLimitedCopyU32FromHistory = ID(0x171)
- IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x172)
- IDLimitedCopyU32FromHistoryFast = ID(0x173)
- IDLimitedCopyU32FromReader = ID(0x174)
- IDLimitedCopyU32FromSlice = ID(0x175)
- IDLimitedCopyU32ToSlice = ID(0x176)
+ IDCopyFromSlice = ID(0x170)
+ IDLimitedCopyU32FromHistory = ID(0x171)
+ IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast = ID(0x172)
+ IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x173)
+ IDLimitedCopyU32FromHistoryFast = ID(0x174)
+ IDLimitedCopyU32FromReader = ID(0x175)
+ IDLimitedCopyU32FromSlice = ID(0x176)
+ IDLimitedCopyU32ToSlice = ID(0x177)
// -------- 0x180 block.
@@ -929,13 +930,14 @@
IDSkipU32: "skip_u32",
IDSkipU32Fast: "skip_u32_fast",
- IDCopyFromSlice: "copy_from_slice",
- IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
- IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
- IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
- IDLimitedCopyU32FromReader: "limited_copy_u32_from_reader",
- IDLimitedCopyU32FromSlice: "limited_copy_u32_from_slice",
- IDLimitedCopyU32ToSlice: "limited_copy_u32_to_slice",
+ IDCopyFromSlice: "copy_from_slice",
+ IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
+ IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast",
+ IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
+ IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
+ IDLimitedCopyU32FromReader: "limited_copy_u32_from_reader",
+ IDLimitedCopyU32FromSlice: "limited_copy_u32_from_slice",
+ IDLimitedCopyU32ToSlice: "limited_copy_u32_to_slice",
// -------- 0x180 block.
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 543fda8..d9756cc 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -10466,6 +10466,46 @@
return length;
}
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+// - distance == 1
+// - distance <= (*ptr_iop_w - io1_w)
+static inline uint32_t //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+ uint8_t** ptr_iop_w,
+ uint8_t* io1_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint64_t x = p[-1];
+ x |= x << 8;
+ x |= x << 16;
+ x |= x << 32;
+ uint32_t n = length;
+ while (1) {
+ wuffs_base__poke_u64le__no_bounds_check(p, x);
+ if (n <= 8) {
+ p += n;
+ break;
+ }
+ p += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return length;
+}
+
// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
// above, but copies 8 byte chunks at a time.
@@ -26485,6 +26525,9 @@
if ((v_dist_minus_1 + 1) >= 8) {
wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
&iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ } else if ((v_dist_minus_1 + 1) == 1) {
+ wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
} else {
wuffs_base__io_writer__limited_copy_u32_from_history_fast(
&iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
@@ -26947,6 +26990,9 @@
if ((v_dist_minus_1 + 1) >= 8) {
wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
&iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ } else if ((v_dist_minus_1 + 1) == 1) {
+ wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
} else {
wuffs_base__io_writer__limited_copy_u32_from_history_fast(
&iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
diff --git a/std/deflate/decode_huffman_bmi2.wuffs b/std/deflate/decode_huffman_bmi2.wuffs
index cad7dc1..991cee8 100644
--- a/std/deflate/decode_huffman_bmi2.wuffs
+++ b/std/deflate/decode_huffman_bmi2.wuffs
@@ -253,6 +253,11 @@
if (dist_minus_1 + 1) >= 8 {
args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
up_to: length, distance: (dist_minus_1 + 1))
+ } else if (dist_minus_1 + 1) == 1 {
+ // (distance == 1) is essentially RLE (Run Length Encoding). It
+ // happens often enough that it's worth special-casing.
+ args.dst.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(
+ up_to: length, distance: (dist_minus_1 + 1))
} else {
args.dst.limited_copy_u32_from_history_fast!(
up_to: length, distance: (dist_minus_1 + 1))
diff --git a/std/deflate/decode_huffman_fast64.wuffs b/std/deflate/decode_huffman_fast64.wuffs
index 8290b3c..a7dc731 100644
--- a/std/deflate/decode_huffman_fast64.wuffs
+++ b/std/deflate/decode_huffman_fast64.wuffs
@@ -252,6 +252,11 @@
if (dist_minus_1 + 1) >= 8 {
args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
up_to: length, distance: (dist_minus_1 + 1))
+ } else if (dist_minus_1 + 1) == 1 {
+ // (distance == 1) is essentially RLE (Run Length Encoding). It
+ // happens often enough that it's worth special-casing.
+ args.dst.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(
+ up_to: length, distance: (dist_minus_1 + 1))
} else {
args.dst.limited_copy_u32_from_history_fast!(
up_to: length, distance: (dist_minus_1 + 1))