std/lzma: use 8-byte-copy if applicable
name old speed new speed delta
wuffs_lzma_decode_100k/clang14 56.2MB/s ± 0% 60.7MB/s ± 0% +7.96% (p=0.008 n=5+5)
wuffs_lzma_decode_100k/gcc12 56.4MB/s ± 1% 61.7MB/s ± 1% +9.44% (p=0.008 n=5+5)
$ time example-mzcat < linux-6.8.2.tar.xz > /dev/null
Before: user 0m8.396s
After: user 0m8.392s
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index e132c24..b1e4e31 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -308,6 +308,52 @@
return length;
}
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+// It also returns the cusp: a byte pair (as a u16le) being the last byte of
+// and next byte after the copied history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - length >= 1
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+// - distance == 1
+// - distance <= (*ptr_iop_w - io0_w)
+static inline uint32_t //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp(
+ uint8_t** ptr_iop_w,
+ uint8_t* io0_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint8_t* q = p - distance;
+ uint64_t x = p[-1];
+ x |= x << 8;
+ x |= x << 16;
+ x |= x << 32;
+ uint32_t n = length;
+ while (1) {
+ wuffs_base__poke_u64le__no_bounds_check(p, x);
+ if (n <= 8) {
+ p += n;
+ q += n;
+ break;
+ }
+ p += 8;
+ q += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast
// is like the
// wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
@@ -348,6 +394,49 @@
return length;
}
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp
+// is like the
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time. It also returns the cusp: a byte
+// pair (as a u16le) being the last byte of and next byte after the copied
+// history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - length >= 1
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+// - distance >= 8
+// - distance <= (*ptr_iop_w - io0_w)
+static inline uint32_t //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp(
+ uint8_t** ptr_iop_w,
+ uint8_t* io0_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint8_t* q = p - distance;
+ uint32_t n = length;
+ while (1) {
+ memcpy(p, q, 8);
+ if (n <= 8) {
+ p += n;
+ q += n;
+ break;
+ }
+ p += 8;
+ q += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
static inline uint32_t //
wuffs_private_impl__io_writer__limited_copy_u32_from_reader(
uint8_t** ptr_iop_w,
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 555624e..b7aec79 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -321,7 +321,9 @@
switch method {
case t.IDLimitedCopyU32FromHistory,
t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast,
+ t.IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp,
t.IDLimitedCopyU32FromHistory8ByteChunksFast,
+ t.IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp,
t.IDLimitedCopyU32FromHistoryFast,
t.IDLimitedCopyU32FromHistoryFastReturnCusp:
b.printf("wuffs_private_impl__io_writer__%s(\n&%s%s, %s%s, %s%s",
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 8e113c6..4f5d0f5 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -613,11 +613,27 @@
// TODO: this should have explicit pre-conditions:
// - up_to >= 1
// - (up_to + 8) <= this.length()
+ // - distance >= 8
+ // - distance <= this.history_length()
+ // For now, that's all implicitly checked (i.e. hard coded).
+ "io_writer.limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp!(up_to: u32, distance: u32) u32[..= 0xFFFF]",
+
+ // TODO: this should have explicit pre-conditions:
+ // - up_to >= 1
+ // - (up_to + 8) <= this.length()
// - distance == 1
// - distance <= this.history_length()
// For now, that's all implicitly checked (i.e. hard coded).
"io_writer.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(up_to: u32, distance: u32) u32",
+ // TODO: this should have explicit pre-conditions:
+ // - up_to >= 1
+ // - (up_to + 8) <= this.length()
+ // - distance == 1
+ // - distance <= this.history_length()
+ // For now, that's all implicitly checked (i.e. hard coded).
+ "io_writer.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp!(up_to: u32, distance: u32) u32[..= 0xFFFF]",
+
// ---- token_writer
"token_writer.write_simple_token_fast!(" +
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index 89d21f1..51b64e7 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1198,12 +1198,14 @@
return bounds{}, err
}
- } else if method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast {
+ } else if (method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast) ||
+ (method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp) {
if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, nil, one); err != nil {
return bounds{}, err
}
- } else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
+ } else if (method == t.IDLimitedCopyU32FromHistory8ByteChunksFast) ||
+ (method == t.IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp) {
if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight, nil); err != nil {
return bounds{}, err
}
diff --git a/lang/token/list.go b/lang/token/list.go
index 4c071ad..404df85 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -522,15 +522,17 @@
IDSkipU32 = ID(0x16C)
IDSkipU32Fast = ID(0x16D)
- IDCopyFromSlice = ID(0x170)
- IDLimitedCopyU32FromHistory = ID(0x171)
- IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast = ID(0x172)
- IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x173)
- IDLimitedCopyU32FromHistoryFast = ID(0x174)
- IDLimitedCopyU32FromHistoryFastReturnCusp = ID(0x175)
- IDLimitedCopyU32FromReader = ID(0x176)
- IDLimitedCopyU32FromSlice = ID(0x177)
- IDLimitedCopyU32ToSlice = ID(0x178)
+ IDCopyFromSlice = ID(0x170)
+ IDLimitedCopyU32FromHistory = ID(0x171)
+ IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast = ID(0x172)
+ IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp = ID(0x173)
+ IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x174)
+ IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp = ID(0x175)
+ IDLimitedCopyU32FromHistoryFast = ID(0x176)
+ IDLimitedCopyU32FromHistoryFastReturnCusp = ID(0x177)
+ IDLimitedCopyU32FromReader = ID(0x178)
+ IDLimitedCopyU32FromSlice = ID(0x179)
+ IDLimitedCopyU32ToSlice = ID(0x17A)
// -------- 0x180 block.
@@ -973,13 +975,15 @@
IDCopyFromSlice: "copy_from_slice",
IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
- IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast",
- IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
- IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
- IDLimitedCopyU32FromHistoryFastReturnCusp: "limited_copy_u32_from_history_fast_return_cusp",
- IDLimitedCopyU32FromReader: "limited_copy_u32_from_reader",
- IDLimitedCopyU32FromSlice: "limited_copy_u32_from_slice",
- IDLimitedCopyU32ToSlice: "limited_copy_u32_to_slice",
+ IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast",
+ IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp",
+ IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
+ IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp: "limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp",
+ IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
+ IDLimitedCopyU32FromHistoryFastReturnCusp: "limited_copy_u32_from_history_fast_return_cusp",
+ IDLimitedCopyU32FromReader: "limited_copy_u32_from_reader",
+ IDLimitedCopyU32FromSlice: "limited_copy_u32_from_slice",
+ IDLimitedCopyU32ToSlice: "limited_copy_u32_to_slice",
// -------- 0x180 block.
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 5ae002d..ba174db 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -15705,6 +15705,52 @@
return length;
}
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+// It also returns the cusp: a byte pair (as a u16le) being the last byte of
+// and next byte after the copied history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - length >= 1
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+// - distance == 1
+// - distance <= (*ptr_iop_w - io0_w)
+static inline uint32_t //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp(
+ uint8_t** ptr_iop_w,
+ uint8_t* io0_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint8_t* q = p - distance;
+ uint64_t x = p[-1];
+ x |= x << 8;
+ x |= x << 16;
+ x |= x << 32;
+ uint32_t n = length;
+ while (1) {
+ wuffs_base__poke_u64le__no_bounds_check(p, x);
+ if (n <= 8) {
+ p += n;
+ q += n;
+ break;
+ }
+ p += 8;
+ q += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast
// is like the
// wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
@@ -15745,6 +15791,49 @@
return length;
}
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp
+// is like the
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time. It also returns the cusp: a byte
+// pair (as a u16le) being the last byte of and next byte after the copied
+// history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+// - length >= 1
+// - (length + 8) <= (io2_w - *ptr_iop_w)
+// - distance >= 8
+// - distance <= (*ptr_iop_w - io0_w)
+static inline uint32_t //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp(
+ uint8_t** ptr_iop_w,
+ uint8_t* io0_w,
+ uint8_t* io2_w,
+ uint32_t length,
+ uint32_t distance) {
+ uint8_t* p = *ptr_iop_w;
+ uint8_t* q = p - distance;
+ uint32_t n = length;
+ while (1) {
+ memcpy(p, q, 8);
+ if (n <= 8) {
+ p += n;
+ q += n;
+ break;
+ }
+ p += 8;
+ q += 8;
+ n -= 8;
+ }
+ *ptr_iop_w = p;
+ return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
static inline uint32_t //
wuffs_private_impl__io_writer__limited_copy_u32_from_reader(
uint8_t** ptr_iop_w,
@@ -52817,15 +52906,22 @@
wuffs_private_impl__io_writer__limited_copy_u32_from_slice(
&iop_a_dst, io2_a_dst,v_adj_dist, wuffs_base__slice_u8__subslice_i(a_workbuf, v_wb_index));
v_len -= v_adj_dist;
- if ((((uint64_t)(v_len)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)(v_dist)) > ((uint64_t)(iop_a_dst - io0_a_dst)))) {
+ if ((((uint64_t)(v_len)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)((v_len + 8u))) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)(v_dist)) > ((uint64_t)(iop_a_dst - io0_a_dst)))) {
status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_dictionary_state);
goto exit;
}
}
- v_match_cusp = wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast_return_cusp(
- &iop_a_dst, io0_a_dst, io2_a_dst, v_len, v_dist);
- v_match_byte = (v_match_cusp >> 8u);
- v_prev_byte = ((uint8_t)(v_match_cusp));
+ if (v_dist >= 8u) {
+ v_match_cusp = wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_len, v_dist);
+ v_match_byte = (v_match_cusp >> 8u);
+ v_prev_byte = ((uint8_t)(v_match_cusp));
+ } else {
+ v_match_cusp = wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast_return_cusp(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_len, v_dist);
+ v_match_byte = (v_match_cusp >> 8u);
+ v_prev_byte = ((uint8_t)(v_match_cusp));
+ }
}
label__outer__break:;
self->private_impl.f_stashed_bytes[0u] = v_prev_byte;
diff --git a/std/lzma/decode_bitstream_fast.wuffs b/std/lzma/decode_bitstream_fast.wuffs
index 5f2d213..03fa8f4 100644
--- a/std/lzma/decode_bitstream_fast.wuffs
+++ b/std/lzma/decode_bitstream_fast.wuffs
@@ -932,6 +932,7 @@
}
pos ~mod+= len as base.u64
assert (len as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 282)
+ assert ((len + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 282)
// Copy from the args.workbuf history, if necessary.
if (dist as base.u64) > args.dst.history_length() {
@@ -984,6 +985,7 @@
len -= adj_dist
assert len >= 1
if ((len as base.u64) > args.dst.length()) or
+ (((len + 8) as base.u64) > args.dst.length()) or
((dist as base.u64) > args.dst.history_length()) {
return "#internal error: inconsistent dictionary state"
}
@@ -993,11 +995,19 @@
assert len >= 1
assert dist >= 1
assert (len as base.u64) <= args.dst.length()
+ assert ((len + 8) as base.u64) <= args.dst.length()
assert (dist as base.u64) <= args.dst.history_length()
- match_cusp = args.dst.limited_copy_u32_from_history_fast_return_cusp!(
- up_to: len, distance: dist)
- match_byte = match_cusp >> 8
- prev_byte = (match_cusp & 0xFF) as base.u8
+ if dist >= 8 {
+ match_cusp = args.dst.limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp!(
+ up_to: len, distance: dist)
+ match_byte = match_cusp >> 8
+ prev_byte = (match_cusp & 0xFF) as base.u8
+ } else {
+ match_cusp = args.dst.limited_copy_u32_from_history_fast_return_cusp!(
+ up_to: len, distance: dist)
+ match_byte = match_cusp >> 8
+ prev_byte = (match_cusp & 0xFF) as base.u8
+ }
} endwhile.outer
this.stashed_bytes[0] = prev_byte