Let rac.Writer support shared resources
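
rac.Writer gains a public ResourcesData field: a list of resources
that can be shared across otherwise independently compressed chunks.
For the Zlib codec, these are shared LZ77-style dictionaries. A
resource is written to the RAC file lazily, the first time that a
chunk actually uses it.

CodecWriter.Compress now receives that list and reports which
resource (if any) was used in the secondary and tertiary slots, and
the new CodecWriter.WrapResource method produces the codec-specific
encoding of a raw resource.

A sketch of the intended call pattern, mirroring the racCompress
helper in the updated tests (dictionary and original are placeholder
byte slices):

	buf := &bytes.Buffer{}
	w := &rac.Writer{
		Writer:        buf,
		CodecWriter:   &raczlib.CodecWriter{},
		DChunkSize:    65536,
		ResourcesData: [][]byte{dictionary},
	}
	if _, err := w.Write(original); err != nil {
		// Handle the error.
	}
	if err := w.Close(); err != nil {
		// Handle the error.
	}
	compressed := buf.Bytes()
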
diff --git a/lib/rac/chunk_writer.go b/lib/rac/chunk_writer.go
index d66c8ed..73eeee9 100644
--- a/lib/rac/chunk_writer.go
+++ b/lib/rac/chunk_writer.go
@@ -43,6 +43,12 @@
 	b[7] = byte(v >> 56)
 }
 
+// OptResource is an option type, optionally holding a ChunkWriter-specific
+// identifier for a shared resource.
+//
+// Zero means that the option is not taken: no shared resource is used.
+type OptResource uint32
+
 // ChunkWriter provides a relatively simple way to write a RAC file - one that
 // is created starting from nothing, as opposed to incrementally modifying an
 // existing RAC file.
diff --git a/lib/rac/rac.go b/lib/rac/rac.go
index 4a97a18..cb7ed47 100644
--- a/lib/rac/rac.go
+++ b/lib/rac/rac.go
@@ -62,12 +62,6 @@
 
 var indexLocationAtEndMagic = []byte("\x72\xC3\x63\x00")
 
-// OptResource is an option type, optionally holding a Writer-specific
-// identifier for a shared resource.
-//
-// Zero means that the option is not taken: no shared resource is used.
-type OptResource uint32
-
 var (
 	errCChunkSizeIsTooSmall          = errors.New("rac: CChunkSize is too small")
 	errILAEndTempFile                = errors.New("rac: IndexLocationAtEnd requires a nil TempFile")
diff --git a/lib/rac/writer.go b/lib/rac/writer.go
index a69c140..1142b3f 100644
--- a/lib/rac/writer.go
+++ b/lib/rac/writer.go
@@ -127,6 +127,10 @@
 	b.curr = nil
 }
 
+// NoResourceUsed is returned by CodecWriter.Compress to indicate that no
+// secondary or tertiary resource was used in the compression.
+const NoResourceUsed int = -1
+
 // CodecWriter specializes a Writer to encode a specific compression codec.
 type CodecWriter interface {
 	// Compress returns the codec-specific compressed form of the concatenation
@@ -135,12 +139,19 @@
 	// The source bytes are conceptually one continuous data stream, but are
 	// presented in two slices to avoid unnecessary copying in the code that
 	// calls Compress. One or both of p and q may be an empty slice. A
-	// Compress(p, q) is equivalent to, but often more efficient than:
+	// Compress(p, q, etc) is equivalent to, but often more efficient than:
 	//   combined := []byte(nil)
 	//   combined = append(combined, p...)
 	//   combined = append(combined, q...)
-	//   Compress(combined, nil)
-	Compress(p []byte, q []byte) (Codec, []byte, error)
+	//   Compress(combined, nil, etc)
+	//
+	// Returning a secondaryResource or tertiaryResource within the range ((0
+	// <= i) && (i < len(resourcesData))) indicates that resourcesData[i] was
+	// used in the compression. A value outside of that range means that no
+	// resource was used in the secondary and/or tertiary slot. The
+	// NoResourceUsed constant (-1) is always outside that range.
+	Compress(p []byte, q []byte, resourcesData [][]byte) (
+		codec Codec, compressed []byte, secondaryResource int, tertiaryResource int, retErr error)
 
 	// Cut modifies encoded's contents such that encoded[:encodedLen] is valid
 	// codec-compressed data, assuming that encoded starts off containing valid
@@ -151,7 +162,13 @@
 	// Decompressing that modified, shorter byte slice produces a prefix (of
 	// length decodedLen) of the decompression of the original, longer byte
 	// slice.
-	Cut(codec Codec, encoded []byte, maxEncodedLen int) (encodedLen int, decodedLen int, retErr error)
+	Cut(codec Codec, encoded []byte, maxEncodedLen int) (
+		encodedLen int, decodedLen int, retErr error)
+
+	// WrapResource returns the Codec-specific encoding of the raw resource.
+	// For example, if raw is the bytes of a shared LZ77-style dictionary,
+	// WrapResource may prepend or append length and checksum data.
+	WrapResource(raw []byte) ([]byte, error)
 }
 
 // Writer provides a relatively simple way to write a RAC file - one that is
@@ -253,6 +270,22 @@
 	// will be used.
 	DChunkSize uint64
 
+	// ResourcesData is a list of resources that can be shared across otherwise
+	// independently compressed chunks.
+	//
+	// The exact semantics for a resource depends on the codec. For example,
+	// the Zlib codec interprets them as shared LZ77-style dictionaries.
+	//
+	// One way to generate shared dictionaries from sub-slices of a single
+	// source file is the command line tool at
+	// https://github.com/google/brotli/blob/master/research/dictionary_generator.cc
+	ResourcesData [][]byte
+
+	// resourcesIDs is the OptResource for each ResourcesData element. Zero
+	// means that the corresponding resource is not yet used (and not yet
+	// written to the RAC file).
+	resourcesIDs []OptResource
+
 	// cChunkSize and dChunkSize are copies of CChunkSize and DChunkSize,
 	// validated during the initialize method. For example, if both CChunkSize
 	// and DChunkSize are zero, dChunkSize will be defaultDChunkSize.
@@ -287,6 +320,7 @@
 		w.err = errInvalidCodecWriter
 		return w.err
 	}
+	w.resourcesIDs = make([]OptResource, len(w.ResourcesData))
 
 	if w.DChunkSize > 0 {
 		w.dChunkSize = w.DChunkSize
@@ -306,6 +340,32 @@
 	return nil
 }
 
+// useResource marks a shared resource as used, if it wasn't already marked.
+//
+// i is the index into both the w.ResourcesData and w.resourcesIDs slices. It
+// is valid to pass an i outside the range [0, len(w.resourcesIDs)), in which
+// case the call is a no-op.
+func (w *Writer) useResource(i int) (OptResource, error) {
+	if (i < 0) || (len(w.resourcesIDs) <= i) {
+		return 0, nil
+	}
+	if id := w.resourcesIDs[i]; id != 0 {
+		return id, nil
+	}
+	wrapped, err := w.CodecWriter.WrapResource(w.ResourcesData[i])
+	if err != nil {
+		w.err = err
+		return 0, err
+	}
+	id, err := w.chunkWriter.AddResource(wrapped)
+	if err != nil {
+		w.err = err
+		return 0, err
+	}
+	w.resourcesIDs[i] = id
+	return id, nil
+}
+
 // Write implements io.Writer.
 func (w *Writer) Write(p []byte) (int, error) {
 	if err := w.initialize(); err != nil {
@@ -342,12 +402,22 @@
 		if len(peek1) == 0 {
 			peek0 = stripTrailingZeroes(peek0)
 		}
-		codec, cBytes, err := w.CodecWriter.Compress(peek0, peek1)
+
+		codec, cBytes, index2, index3, err :=
+			w.CodecWriter.Compress(peek0, peek1, w.ResourcesData)
+		if err != nil {
+			return err
+		}
+		res2, err := w.useResource(index2)
+		if err != nil {
+			return err
+		}
+		res3, err := w.useResource(index3)
 		if err != nil {
 			return err
 		}
 
-		if err := w.chunkWriter.AddChunk(dSize, cBytes, 0, 0, codec); err != nil {
+		if err := w.chunkWriter.AddChunk(dSize, cBytes, res2, res3, codec); err != nil {
 			w.err = err
 			return err
 		}
@@ -402,7 +472,17 @@
 func (w *Writer) tryCChunk(targetDChunkSize uint64, force bool) error {
 	peek0, peek1 := w.uncompressed.peek(targetDChunkSize)
 	dSize := uint64(len(peek0)) + uint64(len(peek1))
-	codec, cBytes, err := w.CodecWriter.Compress(peek0, peek1)
+
+	codec, cBytes, index2, index3, err :=
+		w.CodecWriter.Compress(peek0, peek1, w.ResourcesData)
+	if err != nil {
+		return err
+	}
+	res2, err := w.useResource(index2)
+	if err != nil {
+		return err
+	}
+	res3, err := w.useResource(index3)
 	if err != nil {
 		return err
 	}
@@ -414,7 +494,7 @@
 		}
 		fallthrough
 	case uint64(len(cBytes)) == w.cChunkSize:
-		if err := w.chunkWriter.AddChunk(dSize, cBytes, 0, 0, codec); err != nil {
+		if err := w.chunkWriter.AddChunk(dSize, cBytes, res2, res3, codec); err != nil {
 			w.err = err
 			return err
 		}
@@ -431,7 +511,7 @@
 		w.err = errCChunkSizeIsTooSmall
 		return w.err
 	}
-	if err := w.chunkWriter.AddChunk(uint64(dLen), cBytes[:eLen], 0, 0, codec); err != nil {
+	if err := w.chunkWriter.AddChunk(uint64(dLen), cBytes[:eLen], res2, res3, codec); err != nil {
 		w.err = err
 		return err
 	}
diff --git a/lib/raczlib/raczlib.go b/lib/raczlib/raczlib.go
index 9fe9cdf..c1c3344 100644
--- a/lib/raczlib/raczlib.go
+++ b/lib/raczlib/raczlib.go
@@ -60,6 +60,18 @@
 	return err
 }
 
+// suffix32K returns the last 32 KiB of b, or if b is shorter than that, it
+// returns just b.
+//
+// The Zlib format only supports up to 32 KiB of history or shared dictionary.
+func suffix32K(b []byte) []byte {
+	const n = 32 * 1024
+	if len(b) > n {
+		return b[len(b)-n:]
+	}
+	return b
+}
+
 // CodecReader specializes a rac.Reader to decode Zlib-compressed chunks.
 type CodecReader struct {
 	// cachedZlibReader lets us re-use the memory allocated for a zlib reader,
@@ -164,38 +176,100 @@
 
 // CodecWriter specializes a rac.Writer to encode Zlib-compressed chunks.
 type CodecWriter struct {
+	// These fields help reduce memory allocation by re-using previous byte
+	// buffers or the zlib.Writer.
 	compressed bytes.Buffer
 	zlibWriter *zlib.Writer
+	stash      []byte
 }
 
 // Compress implements rac.CodecWriter.
-func (w *CodecWriter) Compress(p []byte, q []byte) (rac.Codec, []byte, error) {
-	w.compressed.Reset()
+func (w *CodecWriter) Compress(p []byte, q []byte, resourcesData [][]byte) (
+	codec rac.Codec, compressed []byte, secondaryResource int, tertiaryResource int, retErr error) {
 
-	if w.zlibWriter == nil {
-		zw, err := zlib.NewWriterLevel(&w.compressed, zlib.BestCompression)
+	secondaryResource = rac.NoResourceUsed
+
+	// Compress p+q without any shared dictionary.
+	baseline, err := w.compress(p, q, nil)
+	if err != nil {
+		return 0, nil, 0, 0, err
+	}
+	// If there aren't any potential shared dictionaries, or if the baseline
+	// compressed form is small, just return the baseline.
+	const minBaselineSizeToConsiderDictionaries = 256
+	if (len(resourcesData) == 0) || (len(baseline) < minBaselineSizeToConsiderDictionaries) {
+		return rac.CodecZlib, baseline, secondaryResource, rac.NoResourceUsed, nil
+	}
+
+	// w.stash keeps a copy of the best compression so far. Every call to
+	// w.compress can clobber the bytes previously returned by w.compress.
+	w.stash = append(w.stash[:0], baseline...)
+	compressed = w.stash
+
+	// Only use a shared dictionary if it results in something smaller than
+	// 98.4375% (an arbitrary heuristic threshold) of the baseline. Otherwise,
+	// it's arguably not worth the additional complexity.
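+	// For example, a 65536-byte baseline gives a threshold of 64512 bytes.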
+	threshold := (len(baseline) / 64) * 63
+
+	for i, data := range resourcesData {
+		candidate, err := w.compress(p, q, suffix32K(data))
 		if err != nil {
-			return 0, nil, err
+			return 0, nil, 0, 0, err
+		}
+		if n := len(candidate); (n >= threshold) || (n >= len(compressed)) {
+			continue
+		}
+		secondaryResource = i
+
+		// As an optimization, if the final candidate is the winner, we don't
+		// have to copy its contents to w.stash.
+		if i == len(resourcesData)-1 {
+			compressed = candidate
+		} else {
+			w.stash = append(w.stash[:0], candidate...)
+			compressed = w.stash
+		}
+	}
+	return rac.CodecZlib, compressed, secondaryResource, rac.NoResourceUsed, nil
+}
+
+func (w *CodecWriter) compress(p []byte, q []byte, dict []byte) ([]byte, error) {
+	w.compressed.Reset()
+	zw, err := (*zlib.Writer)(nil), error(nil)
+	if len(dict) != 0 {
+		// A zlib.Writer doesn't have a Reset(etc, dict) method, so we have to
+		// allocate a new zlib.Writer.
+		zw, err = zlib.NewWriterLevelDict(&w.compressed, zlib.BestCompression, dict)
+		if err != nil {
+			return nil, err
+		}
+	} else if w.zlibWriter == nil {
+		zw, err = zlib.NewWriterLevelDict(&w.compressed, zlib.BestCompression, nil)
+		if err != nil {
+			return nil, err
 		}
 		w.zlibWriter = zw
 	} else {
-		w.zlibWriter.Reset(&w.compressed)
+		zw = w.zlibWriter
+		zw.Reset(&w.compressed)
 	}
 
 	if len(p) > 0 {
-		if _, err := w.zlibWriter.Write(p); err != nil {
-			return 0, nil, err
+		if _, err := zw.Write(p); err != nil {
+			return nil, err
 		}
 	}
 	if len(q) > 0 {
-		if _, err := w.zlibWriter.Write(q); err != nil {
-			return 0, nil, err
+		if _, err := zw.Write(q); err != nil {
+			return nil, err
 		}
 	}
-	if err := w.zlibWriter.Close(); err != nil {
-		return 0, nil, err
+
+	if err := zw.Close(); err != nil {
+		return nil, err
 	}
-	return rac.CodecZlib, w.compressed.Bytes(), nil
+	return w.compressed.Bytes(), nil
 }
 
 // Cut implements rac.CodecWriter.
@@ -205,3 +279,24 @@
 	}
 	return zlibcut.Cut(nil, encoded, maxEncodedLen)
 }
+
+// WrapResource implements rac.CodecWriter.
+func (w *CodecWriter) WrapResource(raw []byte) ([]byte, error) {
+	// For a description of the RAC+Zlib secondary-data format, see
+	// https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#rac--zlib
+	raw = suffix32K(raw)
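+
+	// The wrapped form is a 2-byte little-endian length prefix, the (at
+	// most 32 KiB of) raw bytes themselves and then a 4-byte big-endian
+	// Adler-32 checksum of those raw bytes: len(raw)+6 bytes in total.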
+	wrapped := make([]byte, len(raw)+6)
+	wrapped[0] = uint8(len(raw) >> 0)
+	wrapped[1] = uint8(len(raw) >> 8)
+	copy(wrapped[2:], raw)
+	a := adler32.Checksum(raw)
+	wrapped[len(wrapped)-4] = uint8(a >> 24)
+	wrapped[len(wrapped)-3] = uint8(a >> 16)
+	wrapped[len(wrapped)-2] = uint8(a >> 8)
+	wrapped[len(wrapped)-1] = uint8(a >> 0)
+	return wrapped, nil
+}
diff --git a/lib/raczlib/raczlib_test.go b/lib/raczlib/raczlib_test.go
index d95374c..da6a4cb 100644
--- a/lib/raczlib/raczlib_test.go
+++ b/lib/raczlib/raczlib_test.go
@@ -17,9 +17,9 @@
 import (
 	"bytes"
 	"encoding/binary"
+	"fmt"
 	"hash/crc32"
 	"io"
-	"strings"
 	"testing"
 
 	"github.com/google/wuffs/lib/rac"
@@ -56,17 +56,42 @@
 		"\x28\x4A\x4D\x85\x71\x00\x01\x00\x00\xFF\xFF\x21\x6E\x04\x66"
 )
 
-func testReader(t *testing.T, decoded string, encoded string) {
+func racCompress(original []byte, dChunkSize uint64, resourcesData [][]byte) ([]byte, error) {
+	buf := &bytes.Buffer{}
+	w := &rac.Writer{
+		Writer:        buf,
+		CodecWriter:   &CodecWriter{},
+		DChunkSize:    dChunkSize,
+		ResourcesData: resourcesData,
+	}
+	if _, err := w.Write(original); err != nil {
+		return nil, fmt.Errorf("Write: %v", err)
+	}
+	if err := w.Close(); err != nil {
+		return nil, fmt.Errorf("Close: %v", err)
+	}
+	return buf.Bytes(), nil
+}
+
+func racDecompress(compressed []byte) ([]byte, error) {
 	buf := &bytes.Buffer{}
 	r := &rac.Reader{
-		ReadSeeker:     strings.NewReader(encoded),
-		CompressedSize: int64(len(encoded)),
+		ReadSeeker:     bytes.NewReader(compressed),
+		CompressedSize: int64(len(compressed)),
 		CodecReaders:   []rac.CodecReader{&CodecReader{}},
 	}
 	if _, err := io.Copy(buf, r); err != nil {
-		t.Fatalf("Copy: %v", err)
+		return nil, err
 	}
-	if got, want := buf.String(), decoded; got != want {
+	return buf.Bytes(), nil
+}
+
+func testReader(t *testing.T, decoded string, encoded string) {
+	g, err := racDecompress([]byte(encoded))
+	if err != nil {
+		t.Fatalf("racDecompress: %v", err)
+	}
+	if got, want := string(g), decoded; got != want {
 		t.Fatalf("got:\n%s\nwant:\n%s", got, want)
 	}
 }
@@ -151,19 +176,10 @@
 
 func TestZeroedBytes(t *testing.T) {
 	original := []byte("abcde\x00\x00\x00\x00j")
-	cBuf := &bytes.Buffer{}
-	w := &rac.Writer{
-		Writer:      cBuf,
-		CodecWriter: &CodecWriter{},
-		DChunkSize:  8,
+	compressed, err := racCompress(original, 8, nil)
+	if err != nil {
+		t.Fatal(err)
 	}
-	if _, err := w.Write(original); err != nil {
-		t.Fatalf("Write: %v", err)
-	}
-	if err := w.Close(); err != nil {
-		t.Fatalf("Close: %v", err)
-	}
-	compressed := cBuf.Bytes()
 
 	r := &rac.Reader{
 		ReadSeeker:     bytes.NewReader(compressed),
@@ -191,3 +207,55 @@
 		}
 	}
 }
+
+func TestSharedDictionary(t *testing.T) {
+	// Make some "dictionary" data that, as an independent chunk, does not
+	// compress very well.
+	const n = 256
+	dictionary := make([]byte, n)
+	for i := range dictionary {
+		dictionary[i] = uint8(i)
+	}
+
+	// Replicate it 32 times.
+	original := make([]byte, 0, 32*n)
+	for len(original) < 32*n {
+		original = append(original, dictionary...)
+	}
+
+	// Measure the RAC-compressed form of that replicated data, without and
+	// with a shared dictionary.
+	compressedLengths := [2]int{}
+	for i := range compressedLengths {
+		resourcesData := [][]byte{}
+		if i > 0 {
+			resourcesData = [][]byte{dictionary}
+		}
+
+		// Compress.
+		compressed, err := racCompress(original, n, resourcesData)
+		if err != nil {
+			t.Fatalf("i=%d: racCompress: %v", i, err)
+		}
+		if len(compressed) == 0 {
+			t.Fatalf("i=%d: compressed form is empty", i)
+		}
+		compressedLengths[i] = len(compressed)
+
+		// Decompress.
+		decompressed, err := racDecompress(compressed)
+		if err != nil {
+			t.Fatalf("i=%d: racDecompress: %v", i, err)
+		}
+		if !bytes.Equal(decompressed, original) {
+			t.Fatalf("i=%d: racDecompress: round trip did not match original", i)
+		}
+	}
+
+	// Using a shared dictionary should improve the compression ratio. The
+	// exact value depends on the Zlib compression algorithm, but we should
+	// expect at least a 4x improvement.
+	if ratio := compressedLengths[0] / compressedLengths[1]; ratio < 4 {
+		t.Fatalf("ratio: got %dx, want at least 4x", ratio)
+	}
+}