| // Copyright 2019 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // ---------------- |
| |
| // Package raczlib provides access to RAC (Random Access Compression) files |
| // with the Zlib compression codec. |
| // |
| // The RAC specification is at |
| // https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md |
| package raczlib |
| |
| import ( |
| "bytes" |
| "compress/zlib" |
| "errors" |
| "hash/adler32" |
| "io" |
| |
| "github.com/google/wuffs/lib/rac" |
| "github.com/google/wuffs/lib/zlibcut" |
| ) |
| |
| var ( |
| errInvalidCodec = errors.New("raczlib: invalid codec") |
| errInvalidDictionary = errors.New("raczlib: invalid dictionary") |
| ) |
| |
| func u32BE(b []byte) uint32 { |
| _ = b[3] // Early bounds check to guarantee safety of reads below. |
| return (uint32(b[0]) << 24) | (uint32(b[1]) << 16) | (uint32(b[2]) << 8) | (uint32(b[3])) |
| } |
| |
| // suffix32K returns the last 32 KiB of b, or if b is shorter than that, it |
| // returns just b. |
| // |
| // The Zlib format only supports up to 32 KiB of history or shared dictionary. |
| func suffix32K(b []byte) []byte { |
| const n = 32 * 1024 |
| if len(b) > n { |
| return b[len(b)-n:] |
| } |
| return b |
| } |
| |
| type resetReadCloser interface { |
| // Reset is the zlib.Resetter interface's sole method. |
| // |
| // We explicitly spell out the method signature, instead of just saying |
| // "zlib.Resetter". It may be possible for a future version of this package |
| // to not depend on the standard library's "compress/zlib" package at all. |
| Reset(r io.Reader, dict []byte) error |
| |
| io.ReadCloser |
| } |
| |
| // CodecReader specializes a rac.Reader to decode Zlib-compressed chunks. |
| type CodecReader struct { |
| // cachedZlibReader lets us re-use the memory allocated for a zlib reader, |
| // when decompressing multiple chunks. |
| cachedZlibReader resetReadCloser |
| |
| // These fields contain the most recently used shared dictionary. |
| cachedDictionary []byte |
| cachedDictionaryCRange rac.Range |
| |
| // buf is a scratch buffer. |
| buf [2]byte |
| } |
| |
| // Accepts implements rac.CodecReader. |
| func (r *CodecReader) Accepts(c rac.Codec) bool { |
| return c == rac.CodecZlib |
| } |
| |
| // Clone implements rac.CodecReader. |
| func (r *CodecReader) Clone() rac.CodecReader { |
| return &CodecReader{} |
| } |
| |
| // MakeReaderContext implements rac.CodecReader. |
| func (r *CodecReader) MakeReaderContext(rs io.ReadSeeker, chunk rac.Chunk) (rac.ReaderContext, error) { |
| // For a description of the RAC+Zlib secondary-data format, see |
| // https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#rac--zlib |
| |
| if !chunk.CTertiary.Empty() { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| if chunk.CSecondary.Empty() { |
| return rac.ReaderContext{}, nil |
| } |
| cRange := chunk.CSecondary |
| |
| // Load from the MRU cache, if it was loaded from the same cRange. |
| if (cRange == r.cachedDictionaryCRange) && !cRange.Empty() { |
| return rac.ReaderContext{Secondary: r.cachedDictionary}, nil |
| } |
| |
| // Check the cRange size and the tTag. |
| if (cRange.Size() < 6) || (chunk.TTag != 0xFF) { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| |
| // Read the dictionary size. |
| if _, err := rs.Seek(cRange[0], io.SeekStart); err != nil { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| if _, err := io.ReadFull(rs, r.buf[:2]); err != nil { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| dictSize := int64(r.buf[0]) | (int64(r.buf[1]) << 8) |
| |
| // Check the size. The +6 is for the 2 byte prefix (dictionary size) and |
| // the 4 byte suffix (checksum). |
| if (dictSize + 6) > cRange.Size() { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| |
| // Allocate or re-use the cachedDictionary buffer. |
| buffer := []byte(nil) |
| if n := dictSize + 4; int64(cap(r.cachedDictionary)) >= n { |
| buffer = r.cachedDictionary[:n] |
| // Invalidate the cached dictionary, as we are re-using its memory. |
| r.cachedDictionaryCRange = rac.Range{} |
| } else { |
| buffer = make([]byte, n) |
| } |
| |
| // Read the dictionary and checksum. |
| if _, err := io.ReadFull(rs, buffer); err != nil { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| |
| // Verify the checksum. |
| dict, checksum := buffer[:dictSize], buffer[dictSize:] |
| if u32BE(checksum) != adler32.Checksum(dict) { |
| return rac.ReaderContext{}, errInvalidDictionary |
| } |
| |
| // Save to the MRU cache and return. |
| r.cachedDictionary = dict |
| r.cachedDictionaryCRange = cRange |
| return rac.ReaderContext{Secondary: dict}, nil |
| } |
| |
| // CodecWriter specializes a rac.Writer to encode Zlib-compressed chunks. |
| type CodecWriter struct { |
| // These fields help reduce memory allocation by re-using previous byte |
| // buffers or the zlib.Writer. |
| compressed bytes.Buffer |
| zlibWriter *zlib.Writer |
| stash []byte |
| } |
| |
| // Clone implements rac.CodecWriter. |
| func (r *CodecWriter) Clone() rac.CodecWriter { |
| return &CodecWriter{} |
| } |
| |
| // Compress implements rac.CodecWriter. |
| func (w *CodecWriter) Compress(p []byte, q []byte, resourcesData [][]byte) ( |
| codec rac.Codec, compressed []byte, secondaryResource int, tertiaryResource int, retErr error) { |
| |
| secondaryResource = rac.NoResourceUsed |
| |
| // Compress p+q without any shared dictionary. |
| baseline, err := w.compress(p, q, nil) |
| if err != nil { |
| return 0, nil, 0, 0, err |
| } |
| // If there aren't any potential shared dictionaries, or if the baseline |
| // compressed form is small, just return the baseline. |
| const minBaselineSizeToConsiderDictionaries = 256 |
| if (len(resourcesData) == 0) || (len(baseline) < minBaselineSizeToConsiderDictionaries) { |
| return rac.CodecZlib, baseline, secondaryResource, rac.NoResourceUsed, nil |
| } |
| |
| // w.stash keeps a copy of the best compression so far. Every call to |
| // w.compress can clobber the bytes previously returned by w.compress. |
| w.stash = append(w.stash[:0], baseline...) |
| compressed = w.stash |
| |
| // Only use a shared dictionary if it results in something smaller than |
| // 98.4375% (an arbitrary heuristic threshold) of the baseline. Otherwise, |
| // it's arguably not worth the additional complexity. |
| threshold := (len(baseline) / 64) * 63 |
| |
| for i, data := range resourcesData { |
| candidate, err := w.compress(p, q, suffix32K(data)) |
| if err != nil { |
| return 0, nil, 0, 0, err |
| } |
| if n := len(candidate); (n >= threshold) || (n >= len(compressed)) { |
| continue |
| } |
| secondaryResource = i |
| |
| // As an optimization, if the final candidate is the winner, we don't |
| // have to copy its contents to w.stash. |
| if i == len(resourcesData)-1 { |
| compressed = candidate |
| } else { |
| w.stash = append(w.stash[:0], candidate...) |
| compressed = w.stash |
| } |
| } |
| return rac.CodecZlib, compressed, secondaryResource, rac.NoResourceUsed, nil |
| } |
| |
| func (w *CodecWriter) compress(p []byte, q []byte, dict []byte) ([]byte, error) { |
| w.compressed.Reset() |
| zw, err := (*zlib.Writer)(nil), error(nil) |
| if len(dict) != 0 { |
| // A zlib.Writer doesn't have a Reset(etc, dict) method, so we have to |
| // allocate a new zlib.Writer. |
| zw, err = zlib.NewWriterLevelDict(&w.compressed, zlib.BestCompression, dict) |
| if err != nil { |
| return nil, err |
| } |
| } else if w.zlibWriter == nil { |
| zw, err = zlib.NewWriterLevelDict(&w.compressed, zlib.BestCompression, nil) |
| if err != nil { |
| return nil, err |
| } |
| w.zlibWriter = zw |
| } else { |
| zw = w.zlibWriter |
| zw.Reset(&w.compressed) |
| } |
| |
| if len(p) > 0 { |
| if _, err := zw.Write(p); err != nil { |
| return nil, err |
| } |
| } |
| if len(q) > 0 { |
| if _, err := zw.Write(q); err != nil { |
| return nil, err |
| } |
| } |
| |
| if err := zw.Close(); err != nil { |
| return nil, err |
| } |
| return w.compressed.Bytes(), nil |
| } |
| |
| // CanCut implements rac.CodecWriter. |
| func (w *CodecWriter) CanCut() bool { |
| return true |
| } |
| |
| // Cut implements rac.CodecWriter. |
| func (w *CodecWriter) Cut(codec rac.Codec, encoded []byte, maxEncodedLen int) (encodedLen int, decodedLen int, retErr error) { |
| if codec != rac.CodecZlib { |
| return 0, 0, errInvalidCodec |
| } |
| return zlibcut.Cut(nil, encoded, maxEncodedLen) |
| } |
| |
| // WrapResource implements rac.CodecWriter. |
| func (w *CodecWriter) WrapResource(raw []byte) ([]byte, error) { |
| // For a description of the RAC+Zlib secondary-data format, see |
| // https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#rac--zlib |
| raw = suffix32K(raw) |
| wrapped := make([]byte, len(raw)+6) |
| wrapped[0] = uint8(len(raw) >> 0) |
| wrapped[1] = uint8(len(raw) >> 8) |
| copy(wrapped[2:], raw) |
| a := adler32.Checksum(raw) |
| wrapped[len(wrapped)-4] = uint8(a >> 24) |
| wrapped[len(wrapped)-3] = uint8(a >> 16) |
| wrapped[len(wrapped)-2] = uint8(a >> 8) |
| wrapped[len(wrapped)-1] = uint8(a >> 0) |
| return wrapped, nil |
| } |