Let rac.Writer support shared resources
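
This moves the OptResource type from rac.go to chunk_writer.go, gives
rac.Writer a ResourcesData field listing resources (for the Zlib codec,
shared LZ77-style dictionaries) that can be shared across otherwise
independently compressed chunks, and extends the rac.CodecWriter
interface: Compress now receives that list and reports which elements,
if any, it used as the secondary and tertiary resources, and a new
WrapResource method produces the codec-specific encoding of a raw
resource. The raczlib implementation truncates dictionaries to Zlib's
32 KiB limit and only uses one when it beats the no-dictionary baseline
by a heuristic margin (63/64 of the baseline size).

A minimal usage sketch, adapted from the new TestSharedDictionary test
and assuming the lib/rac and lib/raczlib packages are imported; the
racCompressWithDictionary name and the 256-byte DChunkSize are
illustrative, not part of this commit:

    // racCompressWithDictionary writes original as a RAC+Zlib file,
    // offering dictionary as a shared resource that the codec may or
    // may not use for each chunk.
    func racCompressWithDictionary(original, dictionary []byte) ([]byte, error) {
        buf := &bytes.Buffer{}
        w := &rac.Writer{
            Writer:        buf,
            CodecWriter:   &raczlib.CodecWriter{},
            DChunkSize:    256,
            ResourcesData: [][]byte{dictionary},
        }
        if _, err := w.Write(original); err != nil {
            return nil, err
        }
        if err := w.Close(); err != nil {
            return nil, err
        }
        return buf.Bytes(), nil
    }
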
diff --git a/lib/rac/chunk_writer.go b/lib/rac/chunk_writer.go
index d66c8ed..73eeee9 100644
--- a/lib/rac/chunk_writer.go
+++ b/lib/rac/chunk_writer.go
@@ -43,6 +43,12 @@
b[7] = byte(v >> 56)
}
+// OptResource is an option type, optionally holding a ChunkWriter-specific
+// identifier for a shared resource.
+//
+// Zero means that the option is not taken: no shared resource is used.
+type OptResource uint32
+
// ChunkWriter provides a relatively simple way to write a RAC file - one that
// is created starting from nothing, as opposed to incrementally modifying an
// existing RAC file.
diff --git a/lib/rac/rac.go b/lib/rac/rac.go
index 4a97a18..cb7ed47 100644
--- a/lib/rac/rac.go
+++ b/lib/rac/rac.go
@@ -62,12 +62,6 @@
var indexLocationAtEndMagic = []byte("\x72\xC3\x63\x00")
-// OptResource is an option type, optionally holding a Writer-specific
-// identifier for a shared resource.
-//
-// Zero means that the option is not taken: no shared resource is used.
-type OptResource uint32
-
var (
errCChunkSizeIsTooSmall = errors.New("rac: CChunkSize is too small")
errILAEndTempFile = errors.New("rac: IndexLocationAtEnd requires a nil TempFile")
diff --git a/lib/rac/writer.go b/lib/rac/writer.go
index a69c140..1142b3f 100644
--- a/lib/rac/writer.go
+++ b/lib/rac/writer.go
@@ -127,6 +127,10 @@
b.curr = nil
}
+// NoResourceUsed is returned by CodecWriter.Compress to indicate that no
+// secondary or tertiary resource was used in the compression.
+const NoResourceUsed int = -1
+
// CodecWriter specializes a Writer to encode a specific compression codec.
type CodecWriter interface {
// Compress returns the codec-specific compressed form of the concatenation
@@ -135,12 +139,19 @@
// The source bytes are conceptually one continuous data stream, but is
// presented in two slices to avoid unnecessary copying in the code that
// calls Compress. One or both of p and q may be an empty slice. A
- // Compress(p, q) is equivalent to, but often more efficient than:
+ // Compress(p, q, etc) is equivalent to, but often more efficient than:
// combined := []byte(nil)
// combined = append(combined, p...)
// combined = append(combined, q...)
- // Compress(combined, nil)
- Compress(p []byte, q []byte) (Codec, []byte, error)
+ // Compress(combined, nil, etc)
+ //
+ // Returning a secondaryResource or tertiaryResource within the range ((0
+ // <= i) && (i < len(resourcesData))) indicates that resourcesData[i] was
+ // used in the compression. A value outside of that range means that no
+ // resource was used in the secondary and/or tertiary slot. The
+ // NoResourceUsed constant (-1) is always outside that range.
+ Compress(p []byte, q []byte, resourcesData [][]byte) (
+ codec Codec, compressed []byte, secondaryResource int, tertiaryResource int, retErr error)
// Cut modifies encoded's contents such that encoded[:encodedLen] is valid
// codec-compressed data, assuming that encoded starts off containing valid
@@ -151,7 +162,13 @@
// Decompressing that modified, shorter byte slice produces a prefix (of
// length decodedLen) of the decompression of the original, longer byte
// slice.
- Cut(codec Codec, encoded []byte, maxEncodedLen int) (encodedLen int, decodedLen int, retErr error)
+ Cut(codec Codec, encoded []byte, maxEncodedLen int) (
+ encodedLen int, decodedLen int, retErr error)
+
+ // WrapResource returns the Codec-specific encoding of the raw resource.
+ // For example, if raw is the bytes of a shared LZ77-style dictionary,
+ // WrapResource may prepend or append length and checksum data.
+ WrapResource(raw []byte) ([]byte, error)
}
// Writer provides a relatively simple way to write a RAC file - one that is
@@ -253,6 +270,22 @@
// will be used.
DChunkSize uint64
+ // ResourcesData is a list of resources that can be shared across otherwise
+ // independently compressed chunks.
+ //
+ // The exact semantics for a resource depends on the codec. For example,
+ // the Zlib codec interprets them as shared LZ77-style dictionaries.
+ //
+ // One way to generate shared dictionaries from sub-slices of a single
+ // source file is the command line tool at
+ // https://github.com/google/brotli/blob/master/research/dictionary_generator.cc
+ ResourcesData [][]byte
+
+ // resourcesIDs is the OptResource for each ResourcesData element. Zero
+ // means that the corresponding resource is not yet used (and not yet written
+ // to the RAC file).
+ resourcesIDs []OptResource
+
// cChunkSize and dChunkSize are copies of CChunkSize and DChunkSize,
// validated during the initialize method. For example, if both CChunkSize
// and DChunkSize are zero, dChunkSize will be defaultDChunkSize.
@@ -287,6 +320,7 @@
w.err = errInvalidCodecWriter
return w.err
}
+ w.resourcesIDs = make([]OptResource, len(w.ResourcesData))
if w.DChunkSize > 0 {
w.dChunkSize = w.DChunkSize
@@ -306,6 +340,32 @@
return nil
}
+// useResource marks a shared resource as used, if it wasn't already marked.
+//
+// i is the index into both the w.ResourcesData and w.resourcesIDs slices. It
+// is valid to pass an i outside the range [0, len(w.resourcesIDs)), in which
+// case the call is a no-op.
+func (w *Writer) useResource(i int) (OptResource, error) {
+ if (i < 0) || (len(w.resourcesIDs) <= i) {
+ return 0, nil
+ }
+ if id := w.resourcesIDs[i]; id != 0 {
+ return id, nil
+ }
+ wrapped, err := w.CodecWriter.WrapResource(w.ResourcesData[i])
+ if err != nil {
+ w.err = err
+ return 0, err
+ }
+ id, err := w.chunkWriter.AddResource(wrapped)
+ if err != nil {
+ w.err = err
+ return 0, err
+ }
+ w.resourcesIDs[i] = id
+ return id, nil
+}
+
// Write implements io.Writer.
func (w *Writer) Write(p []byte) (int, error) {
if err := w.initialize(); err != nil {
@@ -342,12 +402,22 @@
if len(peek1) == 0 {
peek0 = stripTrailingZeroes(peek0)
}
- codec, cBytes, err := w.CodecWriter.Compress(peek0, peek1)
+
+ codec, cBytes, index2, index3, err :=
+ w.CodecWriter.Compress(peek0, peek1, w.ResourcesData)
+ if err != nil {
+ return err
+ }
+ res2, err := w.useResource(index2)
+ if err != nil {
+ return err
+ }
+ res3, err := w.useResource(index3)
if err != nil {
return err
}
- if err := w.chunkWriter.AddChunk(dSize, cBytes, 0, 0, codec); err != nil {
+ if err := w.chunkWriter.AddChunk(dSize, cBytes, res2, res3, codec); err != nil {
w.err = err
return err
}
@@ -402,7 +472,17 @@
func (w *Writer) tryCChunk(targetDChunkSize uint64, force bool) error {
peek0, peek1 := w.uncompressed.peek(targetDChunkSize)
dSize := uint64(len(peek0)) + uint64(len(peek1))
- codec, cBytes, err := w.CodecWriter.Compress(peek0, peek1)
+
+ codec, cBytes, index2, index3, err :=
+ w.CodecWriter.Compress(peek0, peek1, w.ResourcesData)
+ if err != nil {
+ return err
+ }
+ res2, err := w.useResource(index2)
+ if err != nil {
+ return err
+ }
+ res3, err := w.useResource(index3)
if err != nil {
return err
}
@@ -414,7 +494,7 @@
}
fallthrough
case uint64(len(cBytes)) == w.cChunkSize:
- if err := w.chunkWriter.AddChunk(dSize, cBytes, 0, 0, codec); err != nil {
+ if err := w.chunkWriter.AddChunk(dSize, cBytes, res2, res3, codec); err != nil {
w.err = err
return err
}
@@ -431,7 +511,7 @@
w.err = errCChunkSizeIsTooSmall
return w.err
}
- if err := w.chunkWriter.AddChunk(uint64(dLen), cBytes[:eLen], 0, 0, codec); err != nil {
+ if err := w.chunkWriter.AddChunk(uint64(dLen), cBytes[:eLen], res2, res3, codec); err != nil {
w.err = err
return err
}
diff --git a/lib/raczlib/raczlib.go b/lib/raczlib/raczlib.go
index 9fe9cdf..c1c3344 100644
--- a/lib/raczlib/raczlib.go
+++ b/lib/raczlib/raczlib.go
@@ -60,6 +60,18 @@
return err
}
+// suffix32K returns the last 32 KiB of b, or if b is shorter than that, it
+// returns just b.
+//
+// The Zlib format only supports up to 32 KiB of history or shared dictionary.
+func suffix32K(b []byte) []byte {
+ const n = 32 * 1024
+ if len(b) > n {
+ return b[len(b)-n:]
+ }
+ return b
+}
+
// CodecReader specializes a rac.Reader to decode Zlib-compressed chunks.
type CodecReader struct {
// cachedZlibReader lets us re-use the memory allocated for a zlib reader,
@@ -164,38 +176,99 @@
// CodecWriter specializes a rac.Writer to encode Zlib-compressed chunks.
type CodecWriter struct {
+ // These fields help reduce memory allocation by re-using previous byte
+ // buffers or the zlib.Writer.
compressed bytes.Buffer
zlibWriter *zlib.Writer
+ stash []byte
}
// Compress implements rac.CodecWriter.
-func (w *CodecWriter) Compress(p []byte, q []byte) (rac.Codec, []byte, error) {
- w.compressed.Reset()
+func (w *CodecWriter) Compress(p []byte, q []byte, resourcesData [][]byte) (
+ codec rac.Codec, compressed []byte, secondaryResource int, tertiaryResource int, retErr error) {
- if w.zlibWriter == nil {
- zw, err := zlib.NewWriterLevel(&w.compressed, zlib.BestCompression)
+ secondaryResource = rac.NoResourceUsed
+
+ // Compress p+q without any shared dictionary.
+ baseline, err := w.compress(p, q, nil)
+ if err != nil {
+ return 0, nil, 0, 0, err
+ }
+ // If there aren't any potential shared dictionaries, or if the baseline
+ // compressed form is small, just return the baseline.
+ const minBaselineSizeToConsiderDictionaries = 256
+ if (len(resourcesData) == 0) || (len(baseline) < minBaselineSizeToConsiderDictionaries) {
+ return rac.CodecZlib, baseline, secondaryResource, rac.NoResourceUsed, nil
+ }
+
+ // w.stash keeps a copy of the best compression so far. Every call to
+ // w.compress can clobber the bytes previously returned by w.compress.
+ w.stash = append(w.stash[:0], baseline...)
+ compressed = w.stash
+
+ // Only use a shared dictionary if it results in something smaller than
+ // 98.4375% (an arbitrary heuristic threshold) of the baseline. Otherwise,
+ // it's arguably not worth the additional complexity.
+ threshold := (len(baseline) / 64) * 63
+
+ for i, data := range resourcesData {
+ candidate, err := w.compress(p, q, suffix32K(data))
if err != nil {
- return 0, nil, err
+ return 0, nil, 0, 0, err
+ }
+ if n := len(candidate); (n >= threshold) || (n >= len(compressed)) {
+ continue
+ }
+ secondaryResource = i
+
+ // As an optimization, if the final candidate is the winner, we don't
+ // have to copy its contents to w.stash.
+ if i == len(resourcesData)-1 {
+ compressed = candidate
+ } else {
+ w.stash = append(w.stash[:0], candidate...)
+ compressed = w.stash
+ }
+ }
+ return rac.CodecZlib, compressed, secondaryResource, rac.NoResourceUsed, nil
+}
+
+func (w *CodecWriter) compress(p []byte, q []byte, dict []byte) ([]byte, error) {
+ w.compressed.Reset()
+ zw, err := (*zlib.Writer)(nil), error(nil)
+ if len(dict) != 0 {
+ // A zlib.Writer doesn't have a Reset(etc, dict) method, so we have to
+ // allocate a new zlib.Writer.
+ zw, err = zlib.NewWriterLevelDict(&w.compressed, zlib.BestCompression, dict)
+ if err != nil {
+ return nil, err
+ }
+ } else if w.zlibWriter == nil {
+ zw, err = zlib.NewWriterLevelDict(&w.compressed, zlib.BestCompression, nil)
+ if err != nil {
+ return nil, err
}
w.zlibWriter = zw
} else {
- w.zlibWriter.Reset(&w.compressed)
+ zw = w.zlibWriter
+ zw.Reset(&w.compressed)
}
if len(p) > 0 {
- if _, err := w.zlibWriter.Write(p); err != nil {
- return 0, nil, err
+ if _, err := zw.Write(p); err != nil {
+ return nil, err
}
}
if len(q) > 0 {
- if _, err := w.zlibWriter.Write(q); err != nil {
- return 0, nil, err
+ if _, err := zw.Write(q); err != nil {
+ return nil, err
}
}
- if err := w.zlibWriter.Close(); err != nil {
- return 0, nil, err
+
+ if err := zw.Close(); err != nil {
+ return nil, err
}
- return rac.CodecZlib, w.compressed.Bytes(), nil
+ return w.compressed.Bytes(), nil
}
// Cut implements rac.CodecWriter.
@@ -205,3 +278,20 @@
}
return zlibcut.Cut(nil, encoded, maxEncodedLen)
}
+
+// WrapResource implements rac.CodecWriter.
+func (w *CodecWriter) WrapResource(raw []byte) ([]byte, error) {
+ // For a description of the RAC+Zlib secondary-data format, see
+ // https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#rac--zlib
+ raw = suffix32K(raw)
+ wrapped := make([]byte, len(raw)+6)
+ wrapped[0] = uint8(len(raw) >> 0)
+ wrapped[1] = uint8(len(raw) >> 8)
+ copy(wrapped[2:], raw)
+ a := adler32.Checksum(raw)
+ wrapped[len(wrapped)-4] = uint8(a >> 24)
+ wrapped[len(wrapped)-3] = uint8(a >> 16)
+ wrapped[len(wrapped)-2] = uint8(a >> 8)
+ wrapped[len(wrapped)-1] = uint8(a >> 0)
+ return wrapped, nil
+}
diff --git a/lib/raczlib/raczlib_test.go b/lib/raczlib/raczlib_test.go
index d95374c..da6a4cb 100644
--- a/lib/raczlib/raczlib_test.go
+++ b/lib/raczlib/raczlib_test.go
@@ -17,9 +17,9 @@
import (
"bytes"
"encoding/binary"
+ "fmt"
"hash/crc32"
"io"
- "strings"
"testing"
"github.com/google/wuffs/lib/rac"
@@ -56,17 +56,42 @@
"\x28\x4A\x4D\x85\x71\x00\x01\x00\x00\xFF\xFF\x21\x6E\x04\x66"
)
-func testReader(t *testing.T, decoded string, encoded string) {
+func racCompress(original []byte, dChunkSize uint64, resourcesData [][]byte) ([]byte, error) {
+ buf := &bytes.Buffer{}
+ w := &rac.Writer{
+ Writer: buf,
+ CodecWriter: &CodecWriter{},
+ DChunkSize: dChunkSize,
+ ResourcesData: resourcesData,
+ }
+ if _, err := w.Write(original); err != nil {
+ return nil, fmt.Errorf("Write: %v", err)
+ }
+ if err := w.Close(); err != nil {
+ return nil, fmt.Errorf("Close: %v", err)
+ }
+ return buf.Bytes(), nil
+}
+
+func racDecompress(compressed []byte) ([]byte, error) {
buf := &bytes.Buffer{}
r := &rac.Reader{
- ReadSeeker: strings.NewReader(encoded),
- CompressedSize: int64(len(encoded)),
+ ReadSeeker: bytes.NewReader(compressed),
+ CompressedSize: int64(len(compressed)),
CodecReaders: []rac.CodecReader{&CodecReader{}},
}
if _, err := io.Copy(buf, r); err != nil {
- t.Fatalf("Copy: %v", err)
+ return nil, err
}
- if got, want := buf.String(), decoded; got != want {
+ return buf.Bytes(), nil
+}
+
+func testReader(t *testing.T, decoded string, encoded string) {
+ g, err := racDecompress([]byte(encoded))
+ if err != nil {
+ t.Fatalf("racDecompress: %v", err)
+ }
+ if got, want := string(g), decoded; got != want {
t.Fatalf("got:\n%s\nwant:\n%s", got, want)
}
}
@@ -151,19 +176,10 @@
func TestZeroedBytes(t *testing.T) {
original := []byte("abcde\x00\x00\x00\x00j")
- cBuf := &bytes.Buffer{}
- w := &rac.Writer{
- Writer: cBuf,
- CodecWriter: &CodecWriter{},
- DChunkSize: 8,
+ compressed, err := racCompress(original, 8, nil)
+ if err != nil {
+ t.Fatal(err)
}
- if _, err := w.Write(original); err != nil {
- t.Fatalf("Write: %v", err)
- }
- if err := w.Close(); err != nil {
- t.Fatalf("Close: %v", err)
- }
- compressed := cBuf.Bytes()
r := &rac.Reader{
ReadSeeker: bytes.NewReader(compressed),
@@ -191,3 +207,55 @@
}
}
}
+
+func TestSharedDictionary(t *testing.T) {
+ // Make some "dictionary" data that, as an independent chunk, does not
+ // compress very well.
+ const n = 256
+ dictionary := make([]byte, n)
+ for i := range dictionary {
+ dictionary[i] = uint8(i)
+ }
+
+ // Replicate it 32 times.
+ original := make([]byte, 0, 32*n)
+ for len(original) < 32*n {
+ original = append(original, dictionary...)
+ }
+
+ // Measure the RAC-compressed form of that replicated data, without and
+ // with a shared dictionary.
+ compressedLengths := [2]int{}
+ for i := range compressedLengths {
+ resourcesData := [][]byte{}
+ if i > 0 {
+ resourcesData = [][]byte{dictionary}
+ }
+
+ // Compress.
+ compressed, err := racCompress(original, n, resourcesData)
+ if err != nil {
+ t.Fatalf("i=%d: racCompress: %v", i, err)
+ }
+ if len(compressed) == 0 {
+ t.Fatalf("i=%d: compressed form is empty", i)
+ }
+ compressedLengths[i] = len(compressed)
+
+ // Decompress.
+ decompressed, err := racDecompress(compressed)
+ if err != nil {
+ t.Fatalf("i=%d: racDecompress: %v", i, err)
+ }
+ if !bytes.Equal(decompressed, original) {
+ t.Fatalf("i=%d: racDecompress: round trip did not match original", i)
+ }
+ }
+
+ // Using a shared dictionary should improve the compression ratio. The
+ // exact value depends on the Zlib compression algorithm, but we should
+ // expect at least a 4x improvement.
+ if ratio := compressedLengths[0] / compressedLengths[1]; ratio < 4 {
+ t.Fatalf("ratio: got %dx, want at least 4x", ratio)
+ }
+}