lib/raczlib/reader.go - external/github.com/google/wuffs - Git at Google

 // Copyright 2019 The Wuffs Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package raczlib

 import (
 	"compress/zlib"
 	"errors"
 	"io"

 	"github.com/google/wuffs/lib/rac"
 )

 // Errors reported by Reader. Apart from the Seek whence error (created
 // inline), these are the package's own error values; errors from the
 // underlying rac.Reader and zlib reader are passed through separately.
 var (
 	// errInvalidCodec is reported by nextChunk when a chunk's Codec is not
 	// rac.CodecZlib.
 	errInvalidCodec          = errors.New("raczlib: invalid Codec (expected rac.CodecZlib)")
 	// errInvalidChunk is reported by nextChunk when a chunk has an empty
 	// DRange, an empty CPrimary or a non-empty CTertiary.
 	errInvalidChunk          = errors.New("raczlib: invalid chunk")
 	// errInvalidChunkTooLarge is reported by readExplicitData when the zlib
 	// stream yields more bytes than remain in the chunk's DRange.
 	errInvalidChunkTooLarge  = errors.New("raczlib: invalid chunk (too large)")
 	// errInvalidChunkTruncated replaces io.ErrUnexpectedEOF when the zlib
 	// stream ends prematurely in readExplicitData.
 	errInvalidChunkTruncated = errors.New("raczlib: invalid chunk (truncated)")
 	// errInvalidReadSeeker is reported by initialize when the Reader's
 	// ReadSeeker field is nil.
 	errInvalidReadSeeker     = errors.New("raczlib: invalid ReadSeeker")

 	// errInternalInconsistentPosition is reported by Read when the
 	// ((dRange[0] <= pos) && (pos <= dRange[1])) invariant does not hold.
 	errInternalInconsistentPosition = errors.New("raczlib: internal error: inconsistent position")
 )

 // Reader reads a RAC file whose chunks are zlib-compressed, serving the
 // decompressed data through its Read and Seek methods.
 //
 // Set the exported fields before the first method call. The unexported state
 // is set up lazily by the first Read or Seek.
 //
 // Do not modify its exported fields after calling any of its methods.
 type Reader struct {
 	// ReadSeeker is where the RAC-encoded data is read from.
 	//
 	// It may also implement io.ReaderAt, in which case its ReadAt method will
 	// be preferred over combining Read and Seek, as the former is presumably
 	// more efficient. This is optional: io.ReaderAt is a stronger contract
 	// than io.ReadSeeker, as multiple concurrent ReadAt calls must not
 	// interfere with each other.
 	//
 	// For example, this type itself only implements io.ReadSeeker, not
 	// io.ReaderAt, as it is not safe for concurrent use.
 	//
 	// Nil is an invalid value: the first Read or Seek call then fails with
 	// errInvalidReadSeeker.
 	ReadSeeker io.ReadSeeker

 	// CompressedSize is the size of the RAC file. It is handed to the
 	// low-level racReader unchanged.
 	//
 	// Zero is an invalid value, as an empty file is not a valid RAC file.
 	CompressedSize int64

 	// err is the first error encountered. It is sticky: once a non-nil error
 	// occurs, all public methods will return that error.
 	//
 	// Two errors are deliberately not recorded here: the io.EOF that marks the
 	// end of the data, and Seek's "invalid whence" error.
 	err error

 	// racReader is the low-level (Codec-agnostic) RAC reader. Its ReadSeeker
 	// field being non-nil doubles as the "already initialized" marker.
 	racReader rac.Reader

 	// These two fields combine for a 3-state state machine:
 	//
 	//  - "State A" (both fields are zero): no RAC chunk is loaded.
 	//
 	//  - "State B" (zlibReader is non-zero, inImplicitZeroes is zero): a RAC
 	//    chunk is loaded, but not fully exhausted: decompressing the zlib
 	//    stream has not hit io.EOF yet.
 	//
 	//  - "State C" (zlibReader is zero, inImplicitZeroes is non-zero): a RAC
 	//    chunk was exhausted, and we now serve the implicit NUL bytes after a
 	//    chunk's explicitly encoded data. The number of NUL bytes can be (and
 	//    often is) zero.
 	//
 	// Calling Read may trigger state transitions (which form a cycle): "State
 	// A" -> "State B" -> "State C" -> "State A" -> "State B" -> etc.
 	//
 	// Calling Seek may reset the state machine to "State A" (it does so
 	// whenever the position actually changes).
 	//
 	// The initial state is "State A".
 	zlibReader       io.ReadCloser
 	inImplicitZeroes bool

 	// cachedZlibReader lets us re-use the memory allocated for a zlib reader,
 	// when decompressing multiple chunks. It is set when the first chunk is
 	// loaded and Reset for every later chunk.
 	cachedZlibReader zlib.Resetter

 	// currChunk is an io.Reader for the current chunk, used while in "State
 	// B". It serves zlib-compressed data, which the (non-nil) zlibReader turns
 	// into decompressed data.
 	//
 	// Its R field is the ReadSeeker. Its N field is set to the size of the
 	// chunk's CPrimary range each time a chunk is loaded.
 	currChunk io.LimitedReader

 	// pos is the current position, in DSpace. It is the base value when Seek
 	// is called with io.SeekCurrent.
 	pos int64

 	// dRange is, in "State B" and "State C", what part (in DSpace) of the
 	// current chunk has not yet been passed on (via this type's Read method).
 	//
 	// Within those states, dRange[0] increases over time, as parts of the
 	// chunk are decompressed and passed on, but dRange[1] does not change.
 	//
 	// An invariant is that ((dRange[0] <= pos) && (pos <= dRange[1])). Read
 	// checks it and fails with errInternalInconsistentPosition if it is
 	// violated.
 	//
 	// If the first inequality is strict (i.e. dRange[0] < pos) then we have
 	// Seek'ed to a pos that is not a chunk boundary, and satisfying the Read
 	// method will first require decompressing and discarding some of the chunk
 	// data, until dRange[0] reaches pos.
 	//
 	// If the second inequality is strict (i.e. pos < dRange[1]) and we are in
 	// "State C" then we have a non-zero number of implicit NUL bytes left.
 	//
 	// In "State A", the dRange is empty and unused, other than trivially
 	// maintaining the invariant.
 	dRange rac.Range
 }

 // initialize performs the one-time, lazy set-up shared by Read and Seek.
 //
 // It returns the sticky error if one is already recorded, does nothing if the
 // low-level RAC reader has already been wired up, and otherwise copies the
 // exported fields into the low-level reader and the chunk reader. A nil
 // ReadSeeker is rejected with the (sticky) errInvalidReadSeeker.
 func (r *Reader) initialize() error {
 	switch {
 	case r.err != nil:
 		// An earlier failure is sticky.
 		return r.err
 	case r.racReader.ReadSeeker != nil:
 		// An earlier call already did the set-up.
 		return nil
 	case r.ReadSeeker == nil:
 		r.err = errInvalidReadSeeker
 		return r.err
 	}

 	src := r.ReadSeeker
 	r.currChunk.R = src
 	r.racReader.CompressedSize = r.CompressedSize
 	r.racReader.ReadSeeker = src
 	return nil
 }

 // Read implements io.Reader.
 //
 // It fills p with decompressed data starting at the current position,
 // loading further chunks as needed. It stops at the first error and returns
 // it together with the number of bytes produced so far. At the end of the
 // data that error is the io.EOF passed on by nextChunk.
 func (r *Reader) Read(p []byte) (int, error) {
 	if err := r.initialize(); err != nil {
 		return 0, err
 	}

 	total := 0
 	for len(p) > 0 {
 		// Every iteration must start with dRange[0] <= pos <= dRange[1].
 		if !((r.dRange[0] <= r.pos) && (r.pos <= r.dRange[1])) {
 			r.err = errInternalInconsistentPosition
 			return total, r.err
 		}

 		var (
 			n   int
 			err error
 		)
 		switch {
 		case r.zlibReader != nil:
 			// "State B": serve decompressed chunk data.
 			n, err = r.readExplicitData(p)

 		case r.inImplicitZeroes:
 			// "State C": serve the chunk's trailing NUL bytes.
 			n, err = r.readImplicitZeroes(p)

 		default:
 			// "State A": load a chunk, then re-dispatch.
 			if err = r.nextChunk(); err != nil {
 				return total, err
 			}
 			continue
 		}

 		total += n
 		p = p[n:]
 		if err != nil {
 			return total, err
 		}
 	}
 	return total, nil
 }

 // readExplicitData serves the zlib-compressed data in a chunk ("State B").
 //
 // It returns the number of bytes of p that now hold decompressed data located
 // at (the previous value of) r.pos. When the zlib stream reports io.EOF it
 // moves the Reader to "State C" and returns the result of that transition.
 // Any other error becomes the sticky error. A prematurely ended stream is
 // reported as errInvalidChunkTruncated, and a stream that yields more bytes
 // than remain in dRange as errInvalidChunkTooLarge.
 func (r *Reader) readExplicitData(p []byte) (int, error) {
 	// If the chunk started before r.pos, discard the opening bytes of the
 	// chunk's decompressed data. The caller's buffer is used as scratch space.
 	for r.pos > r.dRange[0] {
 		discardBuffer := p
 		if toDiscard := r.pos - r.dRange[0]; int64(len(discardBuffer)) > toDiscard {
 			discardBuffer = discardBuffer[:toDiscard]
 		}

 		n, err := r.zlibReader.Read(discardBuffer)
 		r.dRange[0] += int64(n)
 		if err == io.EOF {
 			// The n bytes just read lie before r.pos. They are discarded, not
 			// delivered, so report zero bytes to the caller. r.pos falls in
 			// the implicit NULs, which "State C" serves next.
 			return 0, r.transitionFromStateBToStateC()
 		} else if err == io.ErrUnexpectedEOF {
 			// Same mapping as for delivered data below.
 			err = errInvalidChunkTruncated
 		}
 		if err != nil {
 			r.err = err
 			return 0, r.err
 		}
 	}

 	// Delegate to the zlib reader.
 	n, err := r.zlibReader.Read(p)
 	if size := r.dRange.Size(); int64(n) > size {
 		// The stream holds more data than the chunk's DRange allows: keep only
 		// what fits and fail.
 		n = int(size)
 		err = errInvalidChunkTooLarge
 	}
 	r.pos += int64(n)
 	r.dRange[0] += int64(n)
 	if err == io.EOF {
 		return n, r.transitionFromStateBToStateC()
 	} else if err == io.ErrUnexpectedEOF {
 		err = errInvalidChunkTruncated
 	}
 	if err != nil {
 		r.err = err
 	}
 	return n, err
 }

 // transitionFromStateBToStateC closes the current zlib reader and, if that
 // succeeds, switches the Reader to serving implicit NUL bytes ("State C").
 //
 // A Close error becomes the sticky error, with a bare io.EOF reported as
 // io.ErrUnexpectedEOF. The state is left unchanged in that case.
 func (r *Reader) transitionFromStateBToStateC() error {
 	closeErr := r.zlibReader.Close()
 	switch closeErr {
 	case nil:
 		r.zlibReader, r.inImplicitZeroes = nil, true
 		return nil
 	case io.EOF:
 		closeErr = io.ErrUnexpectedEOF
 	}
 	r.err = closeErr
 	return r.err
 }

 // readImplicitZeroes serves the implicit NUL bytes after a chunk's explicit
 // data ("State C"). As
 // https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#decompressing-a-leaf-node
 // says, "The Codec may produce fewer bytes than the DRange size. In that case,
 // the remaining bytes (in DSpace) are set to NUL (memset to zero)."
 //
 // It zeroes at most r.dRange.Size() bytes of p, returns how many it zeroed and
 // always returns a nil error. Once dRange is used up, the Reader goes back to
 // "State A".
 func (r *Reader) readImplicitZeroes(p []byte) (int, error) {
 	// Skip any implicit NULs that lie before r.pos.
 	if r.pos > r.dRange[0] {
 		r.dRange[0] = r.pos
 	}

 	// Everything left in dRange is implicitly zero. Serve as much as fits.
 	zeroes := p
 	if remaining := r.dRange.Size(); remaining < int64(len(zeroes)) {
 		zeroes = zeroes[:remaining]
 	}
 	for i := 0; i < len(zeroes); i++ {
 		zeroes[i] = 0
 	}

 	// Advance both cursors by the same amount.
 	count := len(zeroes)
 	r.dRange[0] += int64(count)
 	r.pos += int64(count)

 	// An exhausted chunk means "State C" -> "State A".
 	if r.dRange.Empty() {
 		r.inImplicitZeroes = false
 	}
 	return count, nil
 }

 // nextChunk loads the next independently compressed chunk. It transitions from
 // "State A" to "State B".
 //
 // It may return io.EOF, in which case the Reader stays in "State A", and the
 // r.err "sticky error" field stays nil. Every other failure is recorded as the
 // sticky error:
 //
 //  - errInvalidCodec if the chunk is not zlib-compressed,
 //  - errInvalidChunk if DRange or CPrimary is empty or CTertiary is not,
 //  - an "unsupported chunk" error if the chunk has a CSecondary range (a
 //    dictionary), which is not implemented yet,
 //  - any error from the low-level RAC reader, the ReadSeeker or the zlib
 //    reader, with a bare io.EOF from the latter two reported as
 //    io.ErrUnexpectedEOF.
 func (r *Reader) nextChunk() error {
 	// fail records err as the sticky error. An io.EOF while positioning or
 	// opening the zlib stream means the file is cut short, so it is reported
 	// as io.ErrUnexpectedEOF and cannot be mistaken for the end of the data.
 	fail := func(err error) error {
 		if err == io.EOF {
 			err = io.ErrUnexpectedEOF
 		}
 		r.err = err
 		return r.err
 	}

 	chunk, err := r.racReader.NextChunk()
 	if err == io.EOF {
 		// Not sticky: a later Seek may make more data available.
 		return io.EOF
 	} else if err != nil {
 		r.err = err
 		return r.err
 	}
 	if chunk.Codec != rac.CodecZlib {
 		r.err = errInvalidCodec
 		return r.err
 	}
 	if chunk.DRange.Empty() || chunk.CPrimary.Empty() || !chunk.CTertiary.Empty() {
 		r.err = errInvalidChunk
 		return r.err
 	}

 	// Dictionary support is not implemented. Report it as an error instead of
 	// panicking, as whether a chunk has a dictionary depends on the input
 	// file.
 	if !chunk.CSecondary.Empty() {
 		r.err = errors.New("raczlib: unsupported chunk (dictionary)")
 		return r.err
 	}
 	var dict []byte

 	// Position the underlying reader at the chunk's compressed bytes and limit
 	// the zlib reader to exactly that range.
 	if _, err := r.ReadSeeker.Seek(chunk.CPrimary[0], io.SeekStart); err != nil {
 		return fail(err)
 	}
 	r.currChunk.N = chunk.CPrimary.Size()
 	r.dRange = chunk.DRange

 	// Re-use the previous zlib reader's memory when there is one.
 	if r.cachedZlibReader != nil {
 		if err := r.cachedZlibReader.Reset(&r.currChunk, dict); err != nil {
 			return fail(err)
 		}
 		r.zlibReader = r.cachedZlibReader.(io.ReadCloser)
 		return nil
 	}

 	zr, err := zlib.NewReaderDict(&r.currChunk, dict)
 	if err != nil {
 		return fail(err)
 	}
 	r.zlibReader = zr
 	r.cachedZlibReader = zr.(zlib.Resetter)
 	return nil
 }

 // Seek implements io.Seeker.
 //
 // The new position is computed from offset and whence (io.SeekStart,
 // io.SeekCurrent or io.SeekEnd, the last relative to the decompressed size).
 // If it differs from the current position, the low-level RAC reader is moved
 // to the chunk containing it and the state machine is reset to "State A".
 //
 // An invalid whence is reported without becoming the sticky error. A negative
 // position and any error from the low-level reader are sticky.
 func (r *Reader) Seek(offset int64, whence int) (int64, error) {
 	if err := r.initialize(); err != nil {
 		return 0, err
 	}

 	var target int64
 	switch whence {
 	case io.SeekStart:
 		target = offset
 	case io.SeekCurrent:
 		target = r.pos + offset
 	case io.SeekEnd:
 		size, err := r.racReader.DecompressedSize()
 		if err != nil {
 			r.err = err
 			return 0, r.err
 		}
 		target = size + offset
 	default:
 		return 0, errors.New("raczlib.Reader.Seek: invalid whence")
 	}

 	// Seeking to where we already are keeps any loaded chunk.
 	if target == r.pos {
 		return r.pos, nil
 	}

 	if target < 0 {
 		r.err = errors.New("raczlib.Reader.Seek: negative position")
 		return 0, r.err
 	}
 	if err := r.racReader.SeekToChunkContaining(target); err != nil {
 		r.err = err
 		return 0, r.err
 	}

 	// Reset to "State A". An empty dRange at the new position trivially
 	// maintains the dRange/pos invariant.
 	r.zlibReader, r.inImplicitZeroes = nil, false
 	r.pos = target
 	r.dRange[0], r.dRange[1] = target, target
 	return r.pos, nil
 }
	// Copyright 2019 The Wuffs Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package raczlib

	import (
	"compress/zlib"
	"errors"
	"io"

	"github.com/google/wuffs/lib/rac"
	)

	// Errors reported by Reader. Apart from the Seek whence error (created
	// inline), these are the package's own error values; errors from the
	// underlying rac.Reader and zlib reader are passed through separately.
	var (
	// errInvalidCodec is reported by nextChunk when a chunk's Codec is not
	// rac.CodecZlib.
	errInvalidCodec = errors.New("raczlib: invalid Codec (expected rac.CodecZlib)")
	// errInvalidChunk is reported by nextChunk when a chunk has an empty
	// DRange, an empty CPrimary or a non-empty CTertiary.
	errInvalidChunk = errors.New("raczlib: invalid chunk")
	// errInvalidChunkTooLarge is reported by readExplicitData when the zlib
	// stream yields more bytes than remain in the chunk's DRange.
	errInvalidChunkTooLarge = errors.New("raczlib: invalid chunk (too large)")
	// errInvalidChunkTruncated replaces io.ErrUnexpectedEOF when the zlib
	// stream ends prematurely in readExplicitData.
	errInvalidChunkTruncated = errors.New("raczlib: invalid chunk (truncated)")
	// errInvalidReadSeeker is reported by initialize when the Reader's
	// ReadSeeker field is nil.
	errInvalidReadSeeker = errors.New("raczlib: invalid ReadSeeker")

	// errInternalInconsistentPosition is reported by Read when the
	// ((dRange[0] <= pos) && (pos <= dRange[1])) invariant does not hold.
	errInternalInconsistentPosition = errors.New("raczlib: internal error: inconsistent position")
	)

	// Reader reads a RAC file whose chunks are zlib-compressed, serving the
	// decompressed data through its Read and Seek methods.
	//
	// Set the exported fields before the first method call. The unexported state
	// is set up lazily by the first Read or Seek.
	//
	// Do not modify its exported fields after calling any of its methods.
	type Reader struct {
	// ReadSeeker is where the RAC-encoded data is read from.
	//
	// It may also implement io.ReaderAt, in which case its ReadAt method will
	// be preferred over combining Read and Seek, as the former is presumably
	// more efficient. This is optional: io.ReaderAt is a stronger contract
	// than io.ReadSeeker, as multiple concurrent ReadAt calls must not
	// interfere with each other.
	//
	// For example, this type itself only implements io.ReadSeeker, not
	// io.ReaderAt, as it is not safe for concurrent use.
	//
	// Nil is an invalid value: the first Read or Seek call then fails with
	// errInvalidReadSeeker.
	ReadSeeker io.ReadSeeker

	// CompressedSize is the size of the RAC file. It is handed to the
	// low-level racReader unchanged.
	//
	// Zero is an invalid value, as an empty file is not a valid RAC file.
	CompressedSize int64

	// err is the first error encountered. It is sticky: once a non-nil error
	// occurs, all public methods will return that error.
	//
	// Two errors are deliberately not recorded here: the io.EOF that marks the
	// end of the data, and Seek's "invalid whence" error.
	err error

	// racReader is the low-level (Codec-agnostic) RAC reader. Its ReadSeeker
	// field being non-nil doubles as the "already initialized" marker.
	racReader rac.Reader

	// These two fields combine for a 3-state state machine:
	//
	//  - "State A" (both fields are zero): no RAC chunk is loaded.
	//
	//  - "State B" (zlibReader is non-zero, inImplicitZeroes is zero): a RAC
	//    chunk is loaded, but not fully exhausted: decompressing the zlib
	//    stream has not hit io.EOF yet.
	//
	//  - "State C" (zlibReader is zero, inImplicitZeroes is non-zero): a RAC
	//    chunk was exhausted, and we now serve the implicit NUL bytes after a
	//    chunk's explicitly encoded data. The number of NUL bytes can be (and
	//    often is) zero.
	//
	// Calling Read may trigger state transitions (which form a cycle): "State
	// A" -> "State B" -> "State C" -> "State A" -> "State B" -> etc.
	//
	// Calling Seek may reset the state machine to "State A" (it does so
	// whenever the position actually changes).
	//
	// The initial state is "State A".
	zlibReader io.ReadCloser
	inImplicitZeroes bool

	// cachedZlibReader lets us re-use the memory allocated for a zlib reader,
	// when decompressing multiple chunks. It is set when the first chunk is
	// loaded and Reset for every later chunk.
	cachedZlibReader zlib.Resetter

	// currChunk is an io.Reader for the current chunk, used while in "State
	// B". It serves zlib-compressed data, which the (non-nil) zlibReader turns
	// into decompressed data.
	//
	// Its R field is the ReadSeeker. Its N field is set to the size of the
	// chunk's CPrimary range each time a chunk is loaded.
	currChunk io.LimitedReader

	// pos is the current position, in DSpace. It is the base value when Seek
	// is called with io.SeekCurrent.
	pos int64

	// dRange is, in "State B" and "State C", what part (in DSpace) of the
	// current chunk has not yet been passed on (via this type's Read method).
	//
	// Within those states, dRange[0] increases over time, as parts of the
	// chunk are decompressed and passed on, but dRange[1] does not change.
	//
	// An invariant is that ((dRange[0] <= pos) && (pos <= dRange[1])). Read
	// checks it and fails with errInternalInconsistentPosition if it is
	// violated.
	//
	// If the first inequality is strict (i.e. dRange[0] < pos) then we have
	// Seek'ed to a pos that is not a chunk boundary, and satisfying the Read
	// method will first require decompressing and discarding some of the chunk
	// data, until dRange[0] reaches pos.
	//
	// If the second inequality is strict (i.e. pos < dRange[1]) and we are in
	// "State C" then we have a non-zero number of implicit NUL bytes left.
	//
	// In "State A", the dRange is empty and unused, other than trivially
	// maintaining the invariant.
	dRange rac.Range
	}

	// initialize performs the one-time, lazy set-up shared by Read and Seek.
	//
	// It returns the sticky error if one is already recorded, does nothing if the
	// low-level RAC reader has already been wired up, and otherwise copies the
	// exported fields into the low-level reader and the chunk reader. A nil
	// ReadSeeker is rejected with the (sticky) errInvalidReadSeeker.
	func (r *Reader) initialize() error {
		switch {
		case r.err != nil:
			// An earlier failure is sticky.
			return r.err
		case r.racReader.ReadSeeker != nil:
			// An earlier call already did the set-up.
			return nil
		case r.ReadSeeker == nil:
			r.err = errInvalidReadSeeker
			return r.err
		}

		src := r.ReadSeeker
		r.currChunk.R = src
		r.racReader.CompressedSize = r.CompressedSize
		r.racReader.ReadSeeker = src
		return nil
	}

	// Read implements io.Reader.
	func (r *Reader) Read(p []byte) (int, error) {
	if err := r.initialize(); err != nil {
	return 0, err
	}
	numRead := 0

	for len(p) > 0 {
	if (r.pos < r.dRange[0]) \|\| (r.dRange[1] < r.pos) {
	r.err = errInternalInconsistentPosition
	return numRead, r.err
	}

	readFunc := (func(*Reader, []byte) (int, error))(nil)
	switch {
	default: // "State A".
	if err := r.nextChunk(); err != nil {
	return numRead, err
	}
	continue

	case r.zlibReader != nil: // "State B".
	readFunc = (*Reader).readExplicitData

	case r.inImplicitZeroes: // "State C".
	readFunc = (*Reader).readImplicitZeroes
	}

	n, err := readFunc(r, p)
	numRead += n
	p = p[n:]
	if err != nil {
	return numRead, err
	}
	}
	return numRead, nil
	}

	// readExplicitData serves the zlib-compressed data in a chunk ("State B").
	//
	// It returns the number of bytes of p that now hold decompressed data located
	// at (the previous value of) r.pos. When the zlib stream reports io.EOF it
	// moves the Reader to "State C" and returns the result of that transition.
	// Any other error becomes the sticky error. A prematurely ended stream is
	// reported as errInvalidChunkTruncated, and a stream that yields more bytes
	// than remain in dRange as errInvalidChunkTooLarge.
	func (r *Reader) readExplicitData(p []byte) (int, error) {
		// If the chunk started before r.pos, discard the opening bytes of the
		// chunk's decompressed data. The caller's buffer is used as scratch space.
		for r.pos > r.dRange[0] {
			discardBuffer := p
			if toDiscard := r.pos - r.dRange[0]; int64(len(discardBuffer)) > toDiscard {
				discardBuffer = discardBuffer[:toDiscard]
			}

			n, err := r.zlibReader.Read(discardBuffer)
			r.dRange[0] += int64(n)
			if err == io.EOF {
				// The n bytes just read lie before r.pos. They are discarded,
				// not delivered, so report zero bytes to the caller. r.pos
				// falls in the implicit NULs, which "State C" serves next.
				return 0, r.transitionFromStateBToStateC()
			} else if err == io.ErrUnexpectedEOF {
				// Same mapping as for delivered data below.
				err = errInvalidChunkTruncated
			}
			if err != nil {
				r.err = err
				return 0, r.err
			}
		}

		// Delegate to the zlib reader.
		n, err := r.zlibReader.Read(p)
		if size := r.dRange.Size(); int64(n) > size {
			// The stream holds more data than the chunk's DRange allows: keep
			// only what fits and fail.
			n = int(size)
			err = errInvalidChunkTooLarge
		}
		r.pos += int64(n)
		r.dRange[0] += int64(n)
		if err == io.EOF {
			return n, r.transitionFromStateBToStateC()
		} else if err == io.ErrUnexpectedEOF {
			err = errInvalidChunkTruncated
		}
		if err != nil {
			r.err = err
		}
		return n, err
	}

	// transitionFromStateBToStateC closes the current zlib reader and, if that
	// succeeds, switches the Reader to serving implicit NUL bytes ("State C").
	//
	// A Close error becomes the sticky error, with a bare io.EOF reported as
	// io.ErrUnexpectedEOF. The state is left unchanged in that case.
	func (r *Reader) transitionFromStateBToStateC() error {
		closeErr := r.zlibReader.Close()
		switch closeErr {
		case nil:
			r.zlibReader, r.inImplicitZeroes = nil, true
			return nil
		case io.EOF:
			closeErr = io.ErrUnexpectedEOF
		}
		r.err = closeErr
		return r.err
	}

	// readImplicitZeroes serves the implicit NUL bytes after a chunk's explicit
	// data ("State C"). As
	// https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md#decompressing-a-leaf-node
	// says, "The Codec may produce fewer bytes than the DRange size. In that case,
	// the remaining bytes (in DSpace) are set to NUL (memset to zero)."
	//
	// It zeroes at most r.dRange.Size() bytes of p, returns how many it zeroed and
	// always returns a nil error. Once dRange is used up, the Reader goes back to
	// "State A".
	func (r *Reader) readImplicitZeroes(p []byte) (int, error) {
		// Skip any implicit NULs that lie before r.pos.
		if r.pos > r.dRange[0] {
			r.dRange[0] = r.pos
		}

		// Everything left in dRange is implicitly zero. Serve as much as fits.
		zeroes := p
		if remaining := r.dRange.Size(); remaining < int64(len(zeroes)) {
			zeroes = zeroes[:remaining]
		}
		for i := 0; i < len(zeroes); i++ {
			zeroes[i] = 0
		}

		// Advance both cursors by the same amount.
		count := len(zeroes)
		r.dRange[0] += int64(count)
		r.pos += int64(count)

		// An exhausted chunk means "State C" -> "State A".
		if r.dRange.Empty() {
			r.inImplicitZeroes = false
		}
		return count, nil
	}

	// nextChunk loads the next independently compressed chunk. It transitions from
	// "State A" to "State B".
	//
	// It may return io.EOF, in which case the Reader stays in "State A", and the
	// r.err "sticky error" field stays nil.
	func (r *Reader) nextChunk() error {
	chunk, err := r.racReader.NextChunk()
	if err == io.EOF {
	return io.EOF
	} else if err != nil {
	r.err = err
	return r.err
	}
	if chunk.Codec != rac.CodecZlib {
	r.err = errInvalidCodec
	return r.err
	}
	if chunk.DRange.Empty() \|\| chunk.CPrimary.Empty() \|\| !chunk.CTertiary.Empty() {
	r.err = errInvalidChunk
	return r.err
	}

	dict := []byte(nil)
	if !chunk.CSecondary.Empty() {
	panic("TODO: dictionary support")
	}

	if _, err := r.ReadSeeker.Seek(chunk.CPrimary[0], io.SeekStart); err != nil {
	if err == io.EOF {
	err = io.ErrUnexpectedEOF
	}
	r.err = err
	return r.err
	}
	r.currChunk.N = chunk.CPrimary.Size()
	r.dRange = chunk.DRange

	if r.cachedZlibReader != nil {
	if err := r.cachedZlibReader.Reset(&r.currChunk, dict); err != nil {
	if err == io.EOF {
	err = io.ErrUnexpectedEOF
	}
	r.err = err
	return r.err
	}
	r.zlibReader = r.cachedZlibReader.(io.ReadCloser)
	} else {
	r.zlibReader, err = zlib.NewReaderDict(&r.currChunk, dict)
	if err != nil {
	if err == io.EOF {
	err = io.ErrUnexpectedEOF
	}
	r.err = err
	return r.err
	}
	r.cachedZlibReader = r.zlibReader.(zlib.Resetter)
	}
	return nil
	}

	// Seek implements io.Seeker.
	//
	// The new position is computed from offset and whence (io.SeekStart,
	// io.SeekCurrent or io.SeekEnd, the last relative to the decompressed size).
	// If it differs from the current position, the low-level RAC reader is moved
	// to the chunk containing it and the state machine is reset to "State A".
	//
	// An invalid whence is reported without becoming the sticky error. A negative
	// position and any error from the low-level reader are sticky.
	func (r *Reader) Seek(offset int64, whence int) (int64, error) {
		if err := r.initialize(); err != nil {
			return 0, err
		}

		var target int64
		switch whence {
		case io.SeekStart:
			target = offset
		case io.SeekCurrent:
			target = r.pos + offset
		case io.SeekEnd:
			size, err := r.racReader.DecompressedSize()
			if err != nil {
				r.err = err
				return 0, r.err
			}
			target = size + offset
		default:
			return 0, errors.New("raczlib.Reader.Seek: invalid whence")
		}

		// Seeking to where we already are keeps any loaded chunk.
		if target == r.pos {
			return r.pos, nil
		}

		if target < 0 {
			r.err = errors.New("raczlib.Reader.Seek: negative position")
			return 0, r.err
		}
		if err := r.racReader.SeekToChunkContaining(target); err != nil {
			r.err = err
			return 0, r.err
		}

		// Reset to "State A". An empty dRange at the new position trivially
		// maintains the dRange/pos invariant.
		r.zlibReader, r.inImplicitZeroes = nil, false
		r.pos = target
		r.dRange[0], r.dRange[1] = target, target
		return r.pos, nil
	}