Add a raczlib.Reader

commit: 566e8e04240a2f7b6f9591b4a40502da4aef580c [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Mon Aug 05 15:42:56 2019 +1000
committer: Nigel Tao <nigeltao@golang.org> Mon Aug 05 15:42:56 2019 +1000
tree: 6ed3379d74150cfa526caf4f4a7fae31db1f4af4
parent: 774744d2f32410c673a75dfe4175fde5094b8049 [diff]
diff --git a/lib/raczlib/example_test.go b/lib/raczlib/example_test.go
new file mode 100644
index 0000000..3a3e27f
--- /dev/null
+++ b/lib/raczlib/example_test.go

@@ -0,0 +1,106 @@
+// Copyright 2019 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raczlib_test
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log"
+
+	"github.com/google/wuffs/lib/raczlib"
+)
+
+// Example_roundTrip compresses some data with a raczlib.Writer and then
+// decompresses it with a raczlib.Reader: first in full, then as an excerpt
+// reached via Seek, which exercises the "random access" part of RAC.
+func Example_roundTrip() {
+	// Build the uncompressed input: the lyrics of a counting song.
+	var lyrics bytes.Buffer
+	for bottles := 99; bottles > 0; bottles-- {
+		fmt.Fprintf(&lyrics, "%d bottles of beer on the wall, %d bottles of beer.\n"+
+			"Take one down, pass it around, %d bottles of beer on the wall.\n",
+			bottles, bottles, bottles-1)
+	}
+	original := lyrics.Bytes()
+
+	// Encode the input as an in-memory RAC file.
+	var encoded bytes.Buffer
+	w := &raczlib.Writer{
+		Writer: &encoded,
+		// DChunkSize is optional: its zero value implies a reasonable
+		// default. A 1 KiB DChunkSize is relatively small, but it makes this
+		// example more interesting, as the RAC file then holds more than one
+		// chunk.
+		DChunkSize: 1024,
+		// IndexLocation is left at its default, which keeps the example
+		// simple. If you're copy/pasting this code, note that an explicit
+		// IndexLocationAtStart can give slightly more efficient RAC files,
+		// at the cost of using more memory to encode.
+	}
+	if _, err := w.Write(original); err != nil {
+		log.Fatalf("Write: %v", err)
+	}
+	if err := w.Close(); err != nil {
+		log.Fatalf("Close: %v", err)
+	}
+	compressed := encoded.Bytes()
+
+	// The precise ratio depends on the zlib encoder, which can change between
+	// Go standard library releases, but it should be at least 4x. It would be
+	// larger without the explicit (and relatively small) DChunkSize.
+	ratio := len(original) / len(compressed)
+	if ratio < 4 {
+		log.Fatalf("compression ratio (%dx) was too small", ratio)
+	}
+
+	// Set up the decompressor.
+	r := &raczlib.Reader{
+		ReadSeeker:     bytes.NewReader(compressed),
+		CompressedSize: int64(len(compressed)),
+	}
+
+	// Decompress everything, from start to finish.
+	var decoded bytes.Buffer
+	if _, err := io.Copy(&decoded, r); err != nil {
+		log.Fatal(err)
+	}
+	wholeFile := decoded.Bytes()
+	if !bytes.Equal(wholeFile, original) {
+		log.Fatal("round trip did not preserve whole file")
+	}
+	fmt.Printf("Whole file preserved (%d bytes).\n", len(wholeFile))
+
+	// Decompress only an excerpt, by seeking to its start.
+	const offset, length = 3000, 1200
+	if _, err := r.Seek(offset, io.SeekStart); err != nil {
+		log.Fatalf("Seek: %v", err)
+	}
+	got := make([]byte, length)
+	if _, err := io.ReadFull(r, got); err != nil {
+		log.Fatalf("ReadFull: %v", err)
+	}
+	want := original[offset : offset+length]
+	if !bytes.Equal(got, want) {
+		log.Fatal("round trip did not preserve excerpt")
+	}
+	fmt.Printf("Excerpt    preserved  (%d bytes).\n", len(got))
+
+	// Output:
+	// Whole file preserved (11357 bytes).
+	// Excerpt    preserved  (1200 bytes).
+}

diff --git a/lib/raczlib/reader.go b/lib/raczlib/reader.go
new file mode 100644
index 0000000..6096709
--- /dev/null
+++ b/lib/raczlib/reader.go

@@ -0,0 +1,366 @@
+// Copyright 2019 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raczlib
+
+import (
+	"compress/zlib"
+	"errors"
+	"io"
+
+	"github.com/google/wuffs/lib/rac"
+)
+
+// The errors that a Reader stores in its sticky err field:
+//
+//  - errInvalidCodec: nextChunk saw a chunk whose Codec is not rac.CodecZlib.
+//  - errInvalidChunk: nextChunk saw a chunk with an empty DRange, an empty
+//    CPrimary or a non-empty CTertiary.
+//  - errInvalidChunkTooLarge: readExplicitData decompressed more bytes than
+//    remain in the chunk's DRange.
+//  - errInvalidChunkTruncated: readExplicitData got io.ErrUnexpectedEOF from
+//    the zlib reader.
+//  - errInvalidReadSeeker: initialize found a nil ReadSeeker field.
+//  - errInternalInconsistentPosition: Read found pos outside of dRange, which
+//    breaks the invariant documented on the dRange field.
+var (
+	errInvalidCodec          = errors.New("raczlib: invalid Codec (expected rac.CodecZlib)")
+	errInvalidChunk          = errors.New("raczlib: invalid chunk")
+	errInvalidChunkTooLarge  = errors.New("raczlib: invalid chunk (too large)")
+	errInvalidChunkTruncated = errors.New("raczlib: invalid chunk (truncated)")
+	errInvalidReadSeeker     = errors.New("raczlib: invalid ReadSeeker")
+
+	errInternalInconsistentPosition = errors.New("raczlib: internal error: inconsistent position")
+)
+
+// Reader reads a RAC file.
+//
+// It implements io.ReadSeeker: Read serves decompressed data and Seek moves
+// the current position within that decompressed data (DSpace).
+//
+// Do not modify its exported fields after calling any of its methods.
+type Reader struct {
+	// ReadSeeker is where the RAC-encoded data is read from.
+	//
+	// It may also implement io.ReaderAt, in which case its ReadAt method will
+	// be preferred over combining Read and Seek, as the former is presumably
+	// more efficient. This is optional: io.ReaderAt is a stronger contract
+	// than io.ReadSeeker, as multiple concurrent ReadAt calls must not
+	// interfere with each other.
+	//
+	// For example, this type itself only implements io.ReadSeeker, not
+	// io.ReaderAt, as it is not safe for concurrent use.
+	//
+	// Nil is an invalid value.
+	ReadSeeker io.ReadSeeker
+
+	// CompressedSize is the size of the RAC file.
+	//
+	// Zero is an invalid value, as an empty file is not a valid RAC file.
+	CompressedSize int64
+
+	// err is the first error encountered. It is sticky: once a non-nil error
+	// occurs, all public methods will return that error.
+	//
+	// Two errors are returned without being recorded here: the io.EOF from
+	// running out of chunks, and Seek's "invalid whence" error.
+	err error
+
+	// racReader is the low-level (Codec-agnostic) RAC reader.
+	//
+	// Its ReadSeeker and CompressedSize fields are copied from this type's
+	// fields of the same name by the initialize method. A non-nil
+	// racReader.ReadSeeker doubles as the "already initialized" marker.
+	racReader rac.Reader
+
+	// These two fields combine for a 3-state state machine:
+	//
+	//  - "State A" (both fields are zero): no RAC chunk is loaded.
+	//
+	//  - "State B" (zlibReader is non-zero, inImplicitZeroes is zero): a RAC
+	//    chunk is loaded, but not fully exhausted: decompressing the zlib
+	//    stream has not hit io.EOF yet.
+	//
+	//  - "State C" (zlibReader is zero, inImplicitZeroes is non-zero): a RAC
+	//    chunk was exhausted, and we now serve the implicit NUL bytes after a
+	//    chunk's explicitly encoded data. The number of NUL bytes can be (and
+	//    often is) zero.
+	//
+	// Calling Read may trigger state transitions (which form a cycle): "State
+	// A" -> "State B" -> "State C" -> "State A" -> "State B" -> etc.
+	//
+	// Calling Seek may reset the state machine to "State A".
+	//
+	// The initial state is "State A".
+	zlibReader       io.ReadCloser
+	inImplicitZeroes bool
+
+	// cachedZlibReader lets us re-use the memory allocated for a zlib reader,
+	// when decompressing multiple chunks.
+	//
+	// It is nil until the first chunk is loaded. After that, it is the same
+	// underlying object that zlibReader holds while in "State B", and it is
+	// Reset (instead of allocating anew) for each subsequent chunk.
+	cachedZlibReader zlib.Resetter
+
+	// currChunk is an io.Reader for the current chunk, used while in "State
+	// B". It serves zlib-compressed data, which the (non-nil) zlibReader turns
+	// into decompressed data.
+	//
+	// Its R field is the ReadSeeker field. Its N field is set, for each chunk,
+	// to the size of that chunk's CPrimary range, after the ReadSeeker has
+	// been positioned at the start of that range.
+	currChunk io.LimitedReader
+
+	// pos is the current position, in DSpace. It is the base value when Seek
+	// is called with io.SeekCurrent.
+	pos int64
+
+	// dRange is, in "State B" and "State C", what part (in DSpace) of the
+	// current chunk has not yet been passed on (via this type's Read method).
+	//
+	// Within those states, dRange[0] increases over time, as parts of the
+	// chunk are decompressed and passed on, but dRange[1] does not change.
+	//
+	// An invariant is that ((dRange[0] <= pos) && (pos <= dRange[1])).
+	//
+	// If the first inequality is strict (i.e. dRange[0] < pos) then we have
+	// Seek'ed to a pos that is not a chunk boundary, and satisfying the Read
+	// method will first require decompressing and discarding some of the chunk
+	// data, until dRange[0] reaches pos.
+	//
+	// If the second inequality is strict (i.e. pos < dRange[1]) and we are in
+	// "State C" then we have a non-zero number of implicit NUL bytes left.
+	//
+	// In "State A", the dRange is empty and unused, other than trivially
+	// maintaining the invariant.
+	dRange rac.Range
+}
+
+// initialize lazily wires up the low-level racReader and the currChunk reader
+// from the exported fields. It is called at the start of every public method.
+//
+// It returns the sticky error, if there is one, and otherwise nil once the
+// Reader is ready for use. A nil ReadSeeker field is a (sticky) error.
+func (r *Reader) initialize() error {
+	switch {
+	case r.err != nil:
+		return r.err
+	case r.racReader.ReadSeeker != nil:
+		// A previous call already did the work below.
+		return nil
+	case r.ReadSeeker == nil:
+		r.err = errInvalidReadSeeker
+		return r.err
+	}
+
+	r.currChunk.R = r.ReadSeeker
+	r.racReader.CompressedSize = r.CompressedSize
+	r.racReader.ReadSeeker = r.ReadSeeker
+	return nil
+}
+
+// Read implements io.Reader.
+func (r *Reader) Read(p []byte) (int, error) {
+	if err := r.initialize(); err != nil {
+		return 0, err
+	}
+	numRead := 0
+
+	for len(p) > 0 {
+		if (r.pos < r.dRange[0]) || (r.dRange[1] < r.pos) {
+			r.err = errInternalInconsistentPosition
+			return numRead, r.err
+		}
+
+		readFunc := (func(*Reader, []byte) (int, error))(nil)
+		switch {
+		default: // "State A".
+			if err := r.nextChunk(); err != nil {
+				return numRead, err
+			}
+			continue
+
+		case r.zlibReader != nil: // "State B".
+			readFunc = (*Reader).readExplicitData
+
+		case r.inImplicitZeroes: // "State C".
+			readFunc = (*Reader).readImplicitZeroes
+		}
+
+		n, err := readFunc(r, p)
+		numRead += n
+		p = p[n:]
+		if err != nil {
+			return numRead, err
+		}
+	}
+	return numRead, nil
+}
+
+// readExplicitData serves the zlib-compressed data in a chunk. It is the Read
+// implementation for "State B".
+//
+// It returns how many bytes of p were filled with decompressed data from
+// r.pos onwards. Bytes that are decompressed only to be discarded (because a
+// Seek landed inside the chunk) are never counted, even though p doubles as
+// the scratch buffer for them.
+//
+// When the zlib stream is exhausted, it transitions to "State C". Any other
+// error is recorded as the sticky r.err.
+func (r *Reader) readExplicitData(p []byte) (int, error) {
+	// If the chunk started before r.pos, discard the opening bytes of the
+	// chunk's decompressed data.
+	for r.pos > r.dRange[0] {
+		discardBuffer := p
+		discardBufferLen := r.pos - r.dRange[0]
+		if int64(len(discardBuffer)) > discardBufferLen {
+			discardBuffer = discardBuffer[:discardBufferLen]
+		}
+
+		n, err := r.zlibReader.Read(discardBuffer)
+		r.dRange[0] += int64(n)
+		if err == io.EOF {
+			// The explicit data ended at or before r.pos. Everything read so
+			// far was discarded, not served, so report zero bytes: the caller
+			// advances p by whatever count we return.
+			return 0, r.transitionFromStateBToStateC()
+		} else if err == io.ErrUnexpectedEOF {
+			// Report truncation the same way as when serving data below.
+			err = errInvalidChunkTruncated
+		}
+		if err != nil {
+			r.err = err
+			return 0, r.err
+		}
+	}
+
+	// Delegate to the zlib reader.
+	n, err := r.zlibReader.Read(p)
+	if size := r.dRange.Size(); int64(n) > size {
+		// The chunk decompressed to more than its DRange allows. Serve only
+		// what fits and fail.
+		n = int(size)
+		err = errInvalidChunkTooLarge
+	}
+	r.pos += int64(n)
+	r.dRange[0] += int64(n)
+	if err == io.EOF {
+		return n, r.transitionFromStateBToStateC()
+	} else if err == io.ErrUnexpectedEOF {
+		err = errInvalidChunkTruncated
+	}
+	if err != nil {
+		r.err = err
+	}
+	return n, err
+}
+
+// transitionFromStateBToStateC closes the exhausted zlib reader and, if that
+// succeeds, switches to serving the chunk's implicit NUL bytes.
+//
+// A Close error becomes the sticky r.err, with io.EOF reported as
+// io.ErrUnexpectedEOF, and the Reader is left in "State B".
+func (r *Reader) transitionFromStateBToStateC() error {
+	err := r.zlibReader.Close()
+	if err == nil {
+		r.zlibReader = nil
+		r.inImplicitZeroes = true
+		return nil
+	}
+
+	if err == io.EOF {
+		err = io.ErrUnexpectedEOF
+	}
+	r.err = err
+	return r.err
+}
+
+// readImplicitZeroes is the Read implementation for "State C": it serves the
+// implicit NUL bytes that follow a chunk's explicit data. Quoting
+// https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#decompressing-a-leaf-node
+// "The Codec may produce fewer bytes than the DRange size. In that case, the
+// remaining bytes (in DSpace) are set to NUL (memset to zero)."
+//
+// It never fails. Once the chunk's DRange is used up, it moves the Reader
+// back to "State A".
+func (r *Reader) readImplicitZeroes(p []byte) (int, error) {
+	// A Seek may have placed r.pos after the end of the explicit data, in
+	// which case the NULs before r.pos are skipped rather than served.
+	if r.pos > r.dRange[0] {
+		r.dRange[0] = r.pos
+	}
+
+	// Everything left in r.dRange is implicitly zero. Serve as much of it as
+	// fits in p.
+	if remaining := r.dRange.Size(); remaining < int64(len(p)) {
+		p = p[:remaining]
+	}
+	for i := 0; i < len(p); i++ {
+		p[i] = 0
+	}
+
+	// Advance both cursors by the amount served.
+	served := len(p)
+	r.pos += int64(served)
+	r.dRange[0] += int64(served)
+	if r.dRange.Empty() {
+		// The chunk is finished: go from "State C" to "State A".
+		r.inImplicitZeroes = false
+	}
+	return served, nil
+}
+
+// nextChunk loads the next independently compressed chunk. It transitions from
+// "State A" to "State B".
+//
+// It may return io.EOF, in which case the Reader stays in "State A", and the
+// r.err "sticky error" field stays nil.
+//
+// Every other failure is recorded in r.err: an error from the low-level RAC
+// reader, a chunk that is not zlib-compressed, a malformed chunk, a chunk that
+// needs a shared dictionary (not supported yet) or an I/O or zlib error while
+// setting up the decompressor.
+func (r *Reader) nextChunk() error {
+	// fail records err as the sticky error and returns it. A bare io.EOF from
+	// seeking or from setting up the zlib reader is reported as
+	// io.ErrUnexpectedEOF, as only running out of chunks (handled separately
+	// below) is a clean io.EOF.
+	fail := func(err error) error {
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
+		}
+		r.err = err
+		return r.err
+	}
+
+	chunk, err := r.racReader.NextChunk()
+	if err == io.EOF {
+		return io.EOF
+	} else if err != nil {
+		return fail(err)
+	}
+	if chunk.Codec != rac.CodecZlib {
+		return fail(errInvalidCodec)
+	}
+	if chunk.DRange.Empty() || chunk.CPrimary.Empty() || !chunk.CTertiary.Empty() {
+		return fail(errInvalidChunk)
+	}
+
+	// TODO: dictionary support. Until then, dict is always nil and a chunk
+	// that refers to a shared dictionary is rejected with an error, instead
+	// of panicking on what may be untrusted input.
+	dict := []byte(nil)
+	if !chunk.CSecondary.Empty() {
+		return fail(errors.New("raczlib: unsupported chunk (dictionaries are not implemented)"))
+	}
+
+	// Position the underlying reader at the chunk's compressed data and limit
+	// currChunk to exactly that data.
+	if _, err := r.ReadSeeker.Seek(chunk.CPrimary[0], io.SeekStart); err != nil {
+		return fail(err)
+	}
+	r.currChunk.N = chunk.CPrimary.Size()
+	r.dRange = chunk.DRange
+
+	if r.cachedZlibReader != nil {
+		// Re-use the zlib reader allocated for an earlier chunk.
+		if err := r.cachedZlibReader.Reset(&r.currChunk, dict); err != nil {
+			return fail(err)
+		}
+		r.zlibReader = r.cachedZlibReader.(io.ReadCloser)
+	} else {
+		r.zlibReader, err = zlib.NewReaderDict(&r.currChunk, dict)
+		if err != nil {
+			return fail(err)
+		}
+		r.cachedZlibReader = r.zlibReader.(zlib.Resetter)
+	}
+	return nil
+}
+
+// Seek implements io.Seeker.
+func (r *Reader) Seek(offset int64, whence int) (int64, error) {
+	if err := r.initialize(); err != nil {
+		return 0, err
+	}
+
+	pos := r.pos
+	switch whence {
+	case io.SeekStart:
+		pos = offset
+	case io.SeekCurrent:
+		pos += offset
+	case io.SeekEnd:
+		end, err := r.racReader.DecompressedSize()
+		if err != nil {
+			r.err = err
+			return 0, r.err
+		}
+		pos = end + offset
+	default:
+		return 0, errors.New("raczlib.Reader.Seek: invalid whence")
+	}
+
+	if r.pos != pos {
+		if pos < 0 {
+			r.err = errors.New("raczlib.Reader.Seek: negative position")
+			return 0, r.err
+		}
+		if err := r.racReader.SeekToChunkContaining(pos); err != nil {
+			r.err = err
+			return 0, r.err
+		}
+		r.pos = pos
+
+		// Maintain the dRange/pos invariant.
+		r.dRange[0] = pos
+		r.dRange[1] = pos
+
+		// Reset to "State A".
+		r.zlibReader = nil
+		r.inImplicitZeroes = false
+	}
+	return r.pos, nil
+}
commit	566e8e04240a2f7b6f9591b4a40502da4aef580c	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Mon Aug 05 15:42:56 2019 +1000
committer	Nigel Tao <nigeltao@golang.org>	Mon Aug 05 15:42:56 2019 +1000
tree	6ed3379d74150cfa526caf4f4a7fae31db1f4af4
parent	774744d2f32410c673a75dfe4175fde5094b8049 [diff]