Add a raczlib.Reader
diff --git a/lib/raczlib/example_test.go b/lib/raczlib/example_test.go
new file mode 100644
index 0000000..3a3e27f
--- /dev/null
+++ b/lib/raczlib/example_test.go
@@ -0,0 +1,106 @@
+// Copyright 2019 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raczlib_test
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "log"
+
+ "github.com/google/wuffs/lib/raczlib"
+)
+
+// Example_roundTrip compresses some data with a raczlib.Writer and then reads
+// it back with a raczlib.Reader, first in full and then as an excerpt reached
+// via Seek, which exercises the "random access" part of RAC.
+func Example_roundTrip() {
+	// Build the uncompressed input.
+	var plain bytes.Buffer
+	for n := 99; n >= 1; n-- {
+		fmt.Fprintf(&plain, "%d bottles of beer on the wall, %d bottles of beer.\n"+
+			"Take one down, pass it around, %d bottles of beer on the wall.\n",
+			n, n, n-1)
+	}
+	original := plain.Bytes()
+
+	// Encode it as a RAC file, held in memory.
+	var encoded bytes.Buffer
+	w := &raczlib.Writer{
+		Writer: &encoded,
+		// DChunkSize may be left at zero, which selects a reasonable default.
+		// An explicit (and relatively small) 1 KiB DChunkSize makes for a more
+		// interesting example, though, because the resultant RAC file then
+		// holds more than one chunk.
+		DChunkSize: 1024,
+		// IndexLocation is left at its default, which keeps this example
+		// simple. If you're copy/pasting this code, note that an explicit
+		// IndexLocationAtStart can give slightly more efficient RAC files, at
+		// the cost of using more memory to encode.
+	}
+	if _, err := w.Write(original); err != nil {
+		log.Fatalf("Write: %v", err)
+	}
+	if err := w.Close(); err != nil {
+		log.Fatalf("Close: %v", err)
+	}
+	compressed := encoded.Bytes()
+
+	// How well this compresses depends on the zlib encoder's algorithm, which
+	// can change across Go standard library releases, but expect at least a
+	// 4x ratio. It'd be larger without the explicit (but relatively small)
+	// DChunkSize.
+	if ratio := len(original) / len(compressed); ratio < 4 {
+		log.Fatalf("compression ratio (%dx) was too small", ratio)
+	}
+
+	// Set up the decompressor.
+	r := &raczlib.Reader{
+		ReadSeeker:     bytes.NewReader(compressed),
+		CompressedSize: int64(len(compressed)),
+	}
+
+	// Decompress everything, from start to finish, and compare it with what
+	// was originally compressed.
+	var roundTripped bytes.Buffer
+	if _, err := io.Copy(&roundTripped, r); err != nil {
+		log.Fatal(err)
+	}
+	wholeFile := roundTripped.Bytes()
+	if !bytes.Equal(wholeFile, original) {
+		log.Fatal("round trip did not preserve whole file")
+	}
+	fmt.Printf("Whole file preserved (%d bytes).\n", len(wholeFile))
+
+	// Decompress just an excerpt. Its offset is not a multiple of the
+	// DChunkSize, and its length is more than one DChunkSize.
+	const offset, length = 3000, 1200
+	want := original[offset : offset+length]
+	got := make([]byte, length)
+	if _, err := r.Seek(offset, io.SeekStart); err != nil {
+		log.Fatalf("Seek: %v", err)
+	}
+	if _, err := io.ReadFull(r, got); err != nil {
+		log.Fatalf("ReadFull: %v", err)
+	}
+	if !bytes.Equal(got, want) {
+		log.Fatal("round trip did not preserve excerpt")
+	}
+	fmt.Printf("Excerpt preserved (%d bytes).\n", len(got))
+
+	// Output:
+	// Whole file preserved (11357 bytes).
+	// Excerpt preserved (1200 bytes).
+}
diff --git a/lib/raczlib/reader.go b/lib/raczlib/reader.go
new file mode 100644
index 0000000..6096709
--- /dev/null
+++ b/lib/raczlib/reader.go
@@ -0,0 +1,366 @@
+// Copyright 2019 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raczlib
+
+import (
+ "compress/zlib"
+ "errors"
+ "io"
+
+ "github.com/google/wuffs/lib/rac"
+)
+
+var (
+ errInvalidCodec = errors.New("raczlib: invalid Codec (expected rac.CodecZlib)")
+ errInvalidChunk = errors.New("raczlib: invalid chunk")
+ errInvalidChunkTooLarge = errors.New("raczlib: invalid chunk (too large)")
+ errInvalidChunkTruncated = errors.New("raczlib: invalid chunk (truncated)")
+ errInvalidReadSeeker = errors.New("raczlib: invalid ReadSeeker")
+ // Read returns this if the documented dRange/pos invariant does not hold.
+ errInternalInconsistentPosition = errors.New("raczlib: internal error: inconsistent position")
+)
+
+// Reader reads a RAC file whose chunks use the rac.CodecZlib codec. It
+// implements io.ReadSeeker, with positions measured in DSpace.
+// Do not modify its exported fields after calling any of its methods.
+type Reader struct {
+ // ReadSeeker is where the RAC-encoded data is read from.
+ //
+ // It may also implement io.ReaderAt, in which case its ReadAt method will
+ // be preferred over combining Read and Seek, as the former is presumably
+ // more efficient. This is optional: io.ReaderAt is a stronger contract
+ // than io.ReadSeeker, as multiple concurrent ReadAt calls must not
+ // interfere with each other.
+ //
+ // For example, this type itself only implements io.ReadSeeker, not
+ // io.ReaderAt, as it is not safe for concurrent use.
+ //
+ // Nil is an invalid value.
+ ReadSeeker io.ReadSeeker
+
+ // CompressedSize is the size of the RAC file.
+ //
+ // Zero is an invalid value, as an empty file is not a valid RAC file.
+ CompressedSize int64
+
+ // err is the first error encountered. It is sticky: once a non-nil error
+ // occurs, all public methods will return that error.
+ err error
+
+ // racReader is the low-level (Codec-agnostic) RAC reader.
+ racReader rac.Reader
+
+ // These two fields combine for a 3-state state machine:
+ //
+ // - "State A" (both fields are zero): no RAC chunk is loaded.
+ //
+ // - "State B" (zlibReader is non-zero, inImplicitZeroes is zero): a RAC
+ // chunk is loaded, but not fully exhausted: decompressing the zlib
+ // stream has not hit io.EOF yet.
+ //
+ // - "State C" (zlibReader is zero, inImplicitZeroes is non-zero): a RAC
+ // chunk was exhausted, and we now serve the implicit NUL bytes after a
+ // chunk's explicitly encoded data. The number of NUL bytes can be (and
+ // often is) zero.
+ //
+ // Calling Read may trigger state transitions (which form a cycle): "State
+ // A" -> "State B" -> "State C" -> "State A" -> "State B" -> etc.
+ //
+ // Calling Seek may reset the state machine to "State A".
+ //
+ // The initial state is "State A".
+ zlibReader io.ReadCloser
+ inImplicitZeroes bool
+
+ // cachedZlibReader lets us re-use the memory allocated for a zlib reader,
+ // when decompressing multiple chunks.
+ cachedZlibReader zlib.Resetter
+
+ // currChunk limits reads from ReadSeeker to the current chunk's compressed
+ // bytes, which the (non-nil) zlibReader decompresses while in "State B".
+ // initialize sets its R field once; nextChunk sets its N field per chunk.
+ currChunk io.LimitedReader
+
+ // pos is the current position, in DSpace. It is the base value when Seek
+ // is called with io.SeekCurrent.
+ pos int64
+
+ // dRange is, in "State B" and "State C", what part (in DSpace) of the
+ // current chunk has not yet been passed on (via this type's Read method).
+ //
+ // Within those states, dRange[0] increases over time, as parts of the
+ // chunk are decompressed and passed on, but dRange[1] does not change.
+ //
+ // An invariant is that ((dRange[0] <= pos) && (pos <= dRange[1])).
+ //
+ // If the first inequality is strict (i.e. dRange[0] < pos) then we have
+ // Seek'ed to a pos that is not a chunk boundary, and satisfying the Read
+ // method will first require decompressing and discarding some of the chunk
+ // data, until dRange[0] reaches pos.
+ //
+ // If the second inequality is strict (i.e. pos < dRange[1]) and we are in
+ // "State C" then we have a non-zero number of implicit NUL bytes left.
+ //
+ // In "State A", the dRange is empty and unused, other than trivially
+ // maintaining the invariant.
+ dRange rac.Range
+}
+
+// initialize lazily wires up the Reader's internals on first use. It returns
+// the sticky error, if any, and rejects a nil ReadSeeker.
+func (r *Reader) initialize() error {
+	switch {
+	case r.err != nil:
+		return r.err
+	case r.racReader.ReadSeeker != nil:
+		// An earlier call already did the work below.
+		return nil
+	case r.ReadSeeker == nil:
+		r.err = errInvalidReadSeeker
+		return r.err
+	}
+	r.currChunk.R = r.ReadSeeker
+	r.racReader.CompressedSize = r.CompressedSize
+	r.racReader.ReadSeeker = r.ReadSeeker
+	return nil
+}
+
+// Read implements io.Reader. It serves decompressed data, loading chunk after
+// chunk as needed, until p is full or an error (which may be io.EOF) occurs.
+// The returned count includes bytes read before any such error.
+func (r *Reader) Read(p []byte) (int, error) {
+	if err := r.initialize(); err != nil {
+		return 0, err
+	}
+
+	total := 0
+	for len(p) > 0 {
+		// Check the invariant documented on the dRange field.
+		if !((r.dRange[0] <= r.pos) && (r.pos <= r.dRange[1])) {
+			r.err = errInternalInconsistentPosition
+			return total, r.err
+		}
+
+		n, err := 0, error(nil)
+		switch {
+		case r.zlibReader != nil: // "State B".
+			n, err = r.readExplicitData(p)
+		case r.inImplicitZeroes: // "State C".
+			n, err = r.readImplicitZeroes(p)
+		default: // "State A".
+			if chunkErr := r.nextChunk(); chunkErr != nil {
+				return total, chunkErr
+			}
+			continue
+		}
+
+		total += n
+		p = p[n:]
+		if err != nil {
+			return total, err
+		}
+	}
+	return total, nil
+}
+
+// readExplicitData serves the zlib-compressed data in a chunk.
+func (r *Reader) readExplicitData(p []byte) (int, error) {
+	// If the chunk started before r.pos, decompress and discard the chunk's
+	// opening bytes, borrowing p as scratch space (which io.Reader permits).
+	for r.pos > r.dRange[0] {
+		discardBuffer := p
+		discardBufferLen := r.pos - r.dRange[0]
+		if int64(len(discardBuffer)) > discardBufferLen {
+			discardBuffer = discardBuffer[:discardBufferLen]
+		}
+		// Discarded bytes precede r.pos, so never report them as bytes read.
+		n, err := r.zlibReader.Read(discardBuffer)
+		r.dRange[0] += int64(n)
+		if err == io.EOF {
+			return 0, r.transitionFromStateBToStateC()
+		}
+		if err != nil {
+			r.err = err
+			return 0, r.err
+		}
+	}
+
+	// Delegate to the zlib reader, clamping to the chunk's remaining DRange.
+	n, err := r.zlibReader.Read(p)
+	if size := r.dRange.Size(); int64(n) > size {
+		n = int(size)
+		err = errInvalidChunkTooLarge
+	}
+	r.pos += int64(n)
+	r.dRange[0] += int64(n)
+	if err == io.EOF {
+		return n, r.transitionFromStateBToStateC()
+	} else if err == io.ErrUnexpectedEOF {
+		err = errInvalidChunkTruncated
+	}
+	if err != nil {
+		r.err = err
+	}
+	return n, err
+}
+
+// transitionFromStateBToStateC closes the zlib reader and enters "State C".
+func (r *Reader) transitionFromStateBToStateC() error {
+	closeErr := r.zlibReader.Close()
+	if closeErr == nil {
+		r.zlibReader, r.inImplicitZeroes = nil, true
+		return nil
+	} else if closeErr == io.EOF {
+		closeErr = io.ErrUnexpectedEOF
+	}
+	r.err = closeErr
+	return r.err
+}
+
+// readImplicitZeroes fills p with the implicit NUL bytes that follow a chunk's
+// explicit data. The spec at
+// https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#decompressing-a-leaf-node
+// says, "The Codec may produce fewer bytes than the DRange size. In that case,
+// the remaining bytes (in DSpace) are set to NUL (memset to zero)."
+func (r *Reader) readImplicitZeroes(p []byte) (int, error) {
+	// If the explicit data ended before r.pos (as can happen after a Seek),
+	// skip the implicit NULs that lie before r.pos.
+	if r.pos > r.dRange[0] {
+		r.dRange[0] = r.pos
+	}
+
+	// Whatever remains of the dRange is zeroes. Serve as many as fit in p.
+	zeroes := p
+	if remaining := r.dRange.Size(); remaining < int64(len(zeroes)) {
+		zeroes = zeroes[:remaining]
+	}
+	for i := range zeroes {
+		zeroes[i] = 0
+	}
+	served := int64(len(zeroes))
+
+	// Advance both cursors. Once the dRange is used up, return to "State A".
+	r.dRange[0] += served
+	r.pos += served
+	if r.dRange.Empty() {
+		r.inImplicitZeroes = false
+	}
+	return len(zeroes), nil
+}
+
+// nextChunk loads the next independently compressed chunk. It transitions from
+// "State A" to "State B".
+//
+// It may return io.EOF, in which case the Reader stays in "State A", and the
+// r.err "sticky error" field stays nil. Any other error is stored in r.err.
+func (r *Reader) nextChunk() error {
+	chunk, err := r.racReader.NextChunk()
+	if err == io.EOF {
+		return io.EOF
+	} else if err != nil {
+		r.err = err
+		return r.err
+	}
+	if chunk.Codec != rac.CodecZlib {
+		r.err = errInvalidCodec
+		return r.err
+	}
+	if chunk.DRange.Empty() || chunk.CPrimary.Empty() || !chunk.CTertiary.Empty() {
+		r.err = errInvalidChunk
+		return r.err
+	}
+
+	// TODO: dictionary support. Until then, fail cleanly instead of panicking.
+	dict := []byte(nil)
+	if !chunk.CSecondary.Empty() {
+		r.err = errors.New("raczlib: dictionary support is not implemented")
+		return r.err
+	}
+	// Point currChunk at the chunk's compressed bytes.
+	if _, err := r.ReadSeeker.Seek(chunk.CPrimary[0], io.SeekStart); err != nil {
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
+		}
+		r.err = err
+		return r.err
+	}
+	r.currChunk.N = chunk.CPrimary.Size()
+	r.dRange = chunk.DRange
+	// Re-use the cached zlib reader's memory, if there is one.
+	if r.cachedZlibReader != nil {
+		err = r.cachedZlibReader.Reset(&r.currChunk, dict)
+		if err == nil {
+			r.zlibReader = r.cachedZlibReader.(io.ReadCloser)
+		}
+	} else {
+		r.zlibReader, err = zlib.NewReaderDict(&r.currChunk, dict)
+		if err == nil {
+			r.cachedZlibReader = r.zlibReader.(zlib.Resetter)
+		}
+	}
+	if err == io.EOF {
+		err = io.ErrUnexpectedEOF
+	}
+	if err != nil {
+		r.err = err
+		return r.err
+	}
+	return nil
+}
+
+// Seek implements io.Seeker.
+func (r *Reader) Seek(offset int64, whence int) (int64, error) {
+ if err := r.initialize(); err != nil {
+ return 0, err
+ }
+
+ pos := r.pos
+ switch whence {
+ case io.SeekStart:
+ pos = offset
+ case io.SeekCurrent:
+ pos += offset
+ case io.SeekEnd:
+ end, err := r.racReader.DecompressedSize()
+ if err != nil {
+ r.err = err
+ return 0, r.err
+ }
+ pos = end + offset
+ default:
+ return 0, errors.New("raczlib.Reader.Seek: invalid whence")
+ }
+
+ if r.pos != pos {
+ if pos < 0 {
+ r.err = errors.New("raczlib.Reader.Seek: negative position")
+ return 0, r.err
+ }
+ if err := r.racReader.SeekToChunkContaining(pos); err != nil {
+ r.err = err
+ return 0, r.err
+ }
+ r.pos = pos
+
+ // Maintain the dRange/pos invariant.
+ r.dRange[0] = pos
+ r.dRange[1] = pos
+
+ // Reset to "State A".
+ r.zlibReader = nil
+ r.inImplicitZeroes = false
+ }
+ return r.pos, nil
+}