blob: e6bdfa253ca4df54bce392743e08c775a690831c [file] [log] [blame]
// Copyright 2019 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rac
import (
"hash/crc32"
"io"
"github.com/google/wuffs/lib/readerat"
)
func u48LE(b []byte) int64 {
_ = b[7] // Early bounds check to guarantee safety of reads below.
u := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
const mask = (1 << 48) - 1
return int64(u & mask)
}
func u64LE(b []byte) uint64 {
_ = b[7] // Early bounds check to guarantee safety of reads below.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
// Range is the half-open range [low, high). It is invalid for low to be
// greater than high.
type Range [2]int64
func (r Range) Empty() bool { return r[0] == r[1] }
func (r Range) Size() int64 { return r[1] - r[0] }
func (r Range) Intersect(s Range) Range {
if r[0] < s[0] {
r[0] = s[0]
}
if r[1] > s[1] {
r[1] = s[1]
}
if r[0] > r[1] {
return Range{}
}
return r
}
// Chunk is a compressed chunk returned by a ChunkReader.
//
// See the RAC specification for further discussion.
type Chunk struct {
DRange Range
CPrimary Range
CSecondary Range
CTertiary Range
STag uint8
TTag uint8
Codec Codec
}
// nodeSize returns the size (in CSpace) that a node with the given arity
// occupies.
func nodeSize(arity uint8) int {
return (16 * int(arity)) + 16
}
// rNode is the ChunkReader's representation of a node.
//
// None of its methods, other than valid, should be called unless valid returns
// true.
type rNode [4096]byte
func (b *rNode) arity() int { return int(b[3]) }
func (b *rNode) codecHasMixBit() bool { return b[(8*int(b[3]))+7]&0x40 != 0 }
func (b *rNode) cPtrMax() int64 { return u48LE(b[(16*int(b[3]))+8:]) }
func (b *rNode) dPtrMax() int64 { return u48LE(b[8*int(b[3]):]) }
func (b *rNode) version() uint8 { return b[(16*int(b[3]))+14] }
// codec returns the 1-byte (short) or 7-byte (long) codec as a valid Codec, or
// CodecInvalid.
//
// The (uint64) Codec value returned does not have the Mix Bit (the 62nd bit)
// set, regardless of whether it's set in the rNode's bytes.
func (b *rNode) codec() Codec {
arity := int(b[3])
cByte := b[(8*arity)+7]
// Look for a short codec.
if (cByte & 0x80) == 0 {
return Codec(cByte&0x3F) << 56
}
cByte &= 0x3F
// Look for a long codec.
const highBit, low56Bits = 0x8000000000000000, 0x00FFFFFFFFFFFFFF
for j := uint8(0); j < 4; j++ {
i := int(cByte | (j << 6))
if (i < arity) && (b.tTag(i) == 0xFD) {
base := (8 * arity) + 8
codec := Codec(u64LE(b[(8*i)+base:]))
return (codec & low56Bits) | highBit
}
}
return CodecInvalid
}
func (b *rNode) cLen(i int) uint8 {
base := (8 * int(b[3])) + 14
return b[(8*i)+base]
}
func (b *rNode) cOff(i int, cBias int64) int64 {
base := (8 * int(b[3])) + 8
return cBias + u48LE(b[(8*i)+base:])
}
func (b *rNode) cOffRange(i int, cBias int64) Range {
m := cBias + b.cPtrMax()
if i >= b.arity() {
return Range{m, m}
}
cOff := b.cOff(i, cBias)
if cLen := b.cLen(i); cLen != 0 {
if n := cOff + (int64(cLen) * 1024); m > n {
m = n
}
}
return Range{cOff, m}
}
func (b *rNode) dOff(i int, dBias int64) int64 {
if i == 0 {
return dBias
}
return dBias + u48LE(b[8*i:])
}
func (b *rNode) dOffRange(i int, dBias int64) Range {
return Range{b.dOff(i, dBias), b.dOff(i+1, dBias)}
}
func (b *rNode) dSize(i int) int64 {
x := int64(0)
if i > 0 {
x = u48LE(b[8*i:])
}
return u48LE(b[(8*i)+8:]) - x
}
func (b *rNode) sTag(i int) uint8 {
base := (8 * int(b[3])) + 15
return b[(8*i)+base]
}
func (b *rNode) tTag(i int) uint8 {
return b[(8*i)+7]
}
func (b *rNode) isLeaf(i int) bool {
return b[(8*i)+7] != 0xFE
}
// findChunkContaining returns the largest i < arity such that the i'th DOff is
// less than or equal to the dOff argument.
//
// The dOff argument must be non-negative and less than DOffMax.
func (b *rNode) findChunkContaining(dOff int64, dBias int64) int {
// Define f(x) to be whether the x'th DOff is greater than dOff, so that
// f(-1) is false and f(arity) is true. The DOff values are non-decreasing
// so that f(x) true implies f(x+1) true.
//
// Binary search with the invariant that f(lo-1) is false and f(hi) is true.
// This finds the smallest x such that f(x) is true.
lo, hi := 0, b.arity()
for lo < hi {
mid := int(uint(lo+hi) >> 1) // Avoid overflow when computing mid.
// lo ≤ mid < hi
if b.dOff(mid, dBias) <= dOff {
lo = mid + 1 // Preserves f(lo-1) == false.
} else {
hi = mid // Preserves f(hi) == true.
}
}
// lo == hi, f(lo-1) == false, and f(hi) (= f(lo)) == true. Therefore lo is
// the smallest x such that f(x) is true, so lo-1 is the largest x such
// that f(x) is false.
if lo <= 0 {
panic("rac: internal error: could not find containing chunk")
}
return lo - 1
}
func (b *rNode) chunk(i int, cBias int64, dBias int64) Chunk {
sTag := b.sTag(i)
tTag := b.tTag(i)
return Chunk{
DRange: b.dOffRange(i, dBias),
CPrimary: b.cOffRange(i, cBias),
CSecondary: b.cOffRange(int(sTag), cBias),
CTertiary: b.cOffRange(int(tTag), cBias),
STag: sTag,
TTag: tTag,
Codec: b.codec(),
}
}
func (b *rNode) valid() bool {
// Check the magic and arity.
if (b[0] != magic[0]) || (b[1] != magic[1]) || (b[2] != magic[2]) || (b[3] == 0) {
return false
}
arity := int(b[3])
size := (16 * arity) + 16
if b[3] != b[size-1] {
return false
}
// Check that the "Reserved (0)" bytes are zero and that the TTag values
// aren't in the reserved range [0xC0, 0xFD).
hasChildren := false
for i := 0; i < arity; i++ {
if b[(8*i)+6] != 0 {
return false
}
if tTag := b[(8*i)+7]; (0xC0 <= tTag) && (tTag < 0xFD) {
return false
} else if tTag != 0xFD {
hasChildren = true
}
}
if !hasChildren || (b[(8*arity)+6] != 0) {
return false
}
// Check that the DPtr values are non-decreasing. The first DPtr value is
// implicitly zero.
prev := int64(0)
for i := 1; i <= arity; i++ {
curr := u48LE(b[8*i:])
if curr < prev {
return false
} else if curr != prev {
if tTag := b[(8*i)+7]; tTag == 0xFD {
return false
}
}
prev = curr
}
// Check that no CPtr value exceeds CPtrMax (the final CPtr value), other
// than 0xFD Codec Entries.
base := (8 * arity) + 8
cPtrMax := u48LE(b[size-8:])
for i := 0; i < arity; i++ {
if cPtr := u48LE(b[(8*i)+base:]); cPtr <= cPtrMax {
continue
}
if tTag := b[(8*i)+7]; tTag == 0xFD {
continue
}
return false
}
// Check the the version is non-zero.
if b[(16*arity)+14] == 0 {
return false
}
// Check the checksum.
checksum := crc32.ChecksumIEEE(b[6:size])
checksum ^= checksum >> 16
if (b[4] != uint8(checksum>>0)) || (b[5] != uint8(checksum>>8)) {
return false
}
// Further checking of the codec, version, COffMax and DOffMax requires
// more context, and is done in loadAndValidate.
return b.codec().Valid()
}
// ChunkReader parses a RAC file.
//
// Do not modify its exported fields after calling any of its methods.
type ChunkReader struct {
// ReadSeeker is where the RAC-encoded data is read from.
//
// It may optionally implement io.ReaderAt, in which case its ReadAt method
// will be preferred and its Read and Seek methods will never be called.
// The ReadAt method is safe to use concurrently, so that multiple
// rac.Reader's can concurrently use the same source provided that the
// source (this field, nominally an io.ReadSeeker) implements io.ReaderAt.
//
// In particular, if the source is a bytes.Reader or an os.File, multiple
// rac.ChunkReader's can work in parallel, which can speed up decoding.
//
// This type itself only implements io.ReadSeeker, not io.ReaderAt, as it
// is not safe for concurrent use.
//
// Nil is an invalid value.
ReadSeeker io.ReadSeeker
// CompressedSize is the size of the RAC file in CSpace.
//
// Zero is an invalid value. The smallest valid RAC file is 32 bytes long.
CompressedSize int64
// initialized is set true after the first call on this ChunkReader.
initialized bool
// rootNodeArity is the root node's arity.
rootNodeArity uint8
// needToResolveSeekPosition is whether NextChunk will need to resolve
// seekPosition.
needToResolveSeekPosition bool
// readSeeker is either the same as the ReadSeeker field value, or it is
// that field value wrapped by a readerat.ReadSeeker to be safe to use
// concurrently.
readSeeker io.ReadSeeker
// err is the first error encountered. It is sticky: once a non-nil error
// occurs, all public methods will return that error.
err error
// decompressedSize is the size of the RAC file in DSpace.
decompressedSize int64
// rootNodeCOffset is the position of the root node in the RAC file.
rootNodeCOffset int64
// seekPosition gives, if needToResolveSeekPosition is true, the position
// in DSpace that NextChunk needs to find.
seekPosition int64
// The i (as in "the i'th child of currNode") that denotes the next chunk
// to be returned by NextChunk, if a non-empty chunk. If nextChunk equals
// currNode's arity, then currNode is exhausted and calling NextChunk will
// seek to the next node.
nextChunk int32
// The CBias and DBias of currNode.
currNodeCBias int64
currNodeDBias int64
// currNode is the 4096 byte buffer to hold the current node.
currNode rNode
}
func (r *ChunkReader) checkParameters() error {
if r.ReadSeeker == nil {
r.err = errInvalidReadSeeker
return r.err
}
// The smallest valid RAC file is 32 bytes long.
if r.CompressedSize < 32 {
r.err = errInvalidCompressedSize
return r.err
}
return nil
}
func (r *ChunkReader) initialize() error {
if r.err != nil {
return r.err
}
if r.initialized {
return nil
}
r.initialized = true
if err := r.checkParameters(); err != nil {
return err
}
if ra, ok := r.ReadSeeker.(io.ReaderAt); ok {
r.readSeeker = &readerat.ReadSeeker{
ReaderAt: ra,
Size: r.CompressedSize,
}
} else {
r.readSeeker = r.ReadSeeker
}
if err := r.findRootNode(); err != nil {
return err
}
if r.currNode.version() != 1 {
r.err = errUnsupportedRACFileVersion
return r.err
}
return nil
}
func (r *ChunkReader) findRootNode() error {
// Look at the start of the compressed file.
if _, err := r.readSeeker.Seek(0, io.SeekStart); err != nil {
r.err = err
return err
}
if _, err := io.ReadFull(r.readSeeker, r.currNode[:4]); err != nil {
r.err = err
return err
}
if (r.currNode[0] != magic[0]) ||
(r.currNode[1] != magic[1]) ||
(r.currNode[2] != magic[2]) {
r.err = errInvalidInputMissingMagicBytes
return r.err
}
if found, err := r.tryRootNode(r.currNode[3], false); err != nil {
return err
} else if found {
return nil
}
// Look at the end of the compressed file.
if _, err := r.readSeeker.Seek(r.CompressedSize-1, io.SeekStart); err != nil {
r.err = err
return err
}
if _, err := io.ReadFull(r.readSeeker, r.currNode[:1]); err != nil {
r.err = err
return err
}
if found, err := r.tryRootNode(r.currNode[0], true); err != nil {
return err
} else if found {
return nil
}
return errInvalidInputMissingRootNode
}
func (r *ChunkReader) tryRootNode(arity uint8, fromEnd bool) (found bool, ioErr error) {
if arity == 0 {
return false, nil
}
size := int64(nodeSize(arity))
if r.CompressedSize < size {
return false, nil
}
cOffset := int64(0)
if fromEnd {
cOffset = r.CompressedSize - size
}
if err := r.load(cOffset, arity); err != nil {
return false, err
}
if !r.currNode.valid() {
return false, nil
}
if r.currNode.cPtrMax() != r.CompressedSize {
return false, nil
}
r.needToResolveSeekPosition = true
r.rootNodeCOffset = cOffset
r.rootNodeArity = arity
r.decompressedSize = r.currNode.dPtrMax()
return true, nil
}
// load loads a node from the RAC file into r.currNode. It does not check that
// the result is valid, and the caller should do so if it doesn't already know
// that it is valid.
func (r *ChunkReader) load(cOffset int64, arity uint8) error {
if arity == 0 {
r.err = errInternalInconsistentArity
return r.err
}
size := nodeSize(arity)
if _, err := r.readSeeker.Seek(cOffset, io.SeekStart); err != nil {
r.err = err
return err
}
if _, err := io.ReadFull(r.readSeeker, r.currNode[:size]); err != nil {
r.err = err
return err
}
return nil
}
func (r *ChunkReader) loadAndValidate(cOffset int64,
parentCodec Codec, parentCodecHasMixBit bool, parentVersion uint8, parentCOffMax int64,
childCBias int64, childDSize int64) error {
if (cOffset < 0) || ((r.CompressedSize - 4) < cOffset) {
r.err = errInvalidIndexNode
return r.err
}
if _, err := r.readSeeker.Seek(cOffset, io.SeekStart); err != nil {
r.err = err
return err
}
if _, err := io.ReadFull(r.readSeeker, r.currNode[:4]); err != nil {
r.err = err
return err
}
arity := r.currNode[3]
if arity == 0 {
r.err = errInvalidIndexNode
return r.err
}
size := int64(nodeSize(arity))
if (r.CompressedSize < size) || ((r.CompressedSize - size) < cOffset) {
r.err = errInvalidIndexNode
return r.err
}
if err := r.load(cOffset, arity); err != nil {
return err
}
if !r.currNode.valid() {
r.err = errInvalidIndexNode
return r.err
}
// Validate the parent and child codec, version, COffMax and DOffMax.
childVersion := r.currNode.version()
if !parentChildCodecsValid(parentCodec, r.currNode.codec(), parentCodecHasMixBit) ||
(parentVersion < childVersion) ||
(parentCOffMax < (childCBias + r.currNode.cPtrMax())) ||
(childDSize != r.currNode.dPtrMax()) {
r.err = errInvalidIndexNode
return r.err
}
return nil
}
// DecompressedSize returns the total size of the decompressed data.
func (r *ChunkReader) DecompressedSize() (int64, error) {
if err := r.initialize(); err != nil {
return 0, err
}
return r.decompressedSize, nil
}
// SeekToChunkContaining sets up NextChunk to return the chunk containing
// dSpaceOffset. That chunk does not necessarily start at dSpaceOffset.
//
// It is an error to seek to a negative value.
func (r *ChunkReader) SeekToChunkContaining(dSpaceOffset int64) error {
if err := r.initialize(); err != nil {
return err
}
if dSpaceOffset < 0 {
r.err = errSeekToNegativePosition
return r.err
}
r.needToResolveSeekPosition = true
r.seekPosition = dSpaceOffset
return nil
}
// NextChunk returns the next independently compressed chunk, or io.EOF if
// there are no more chunks.
//
// Empty chunks (those that contain no decompressed data, only metadata) are
// skipped.
func (r *ChunkReader) NextChunk() (Chunk, error) {
if err := r.initialize(); err != nil {
return Chunk{}, err
}
for {
if r.needToResolveSeekPosition {
if r.seekPosition >= r.decompressedSize {
return Chunk{}, io.EOF
}
r.needToResolveSeekPosition = false
if err := r.resolveSeekPosition(); err != nil {
return Chunk{}, err
}
}
for n := int32(r.currNode.arity()); r.nextChunk < n; {
c := r.currNode.chunk(int(r.nextChunk), r.currNodeCBias, r.currNodeDBias)
r.nextChunk++
r.seekPosition = c.DRange[1]
if !c.DRange.Empty() {
return c, nil
}
}
r.needToResolveSeekPosition = true
}
}
func (r *ChunkReader) resolveSeekPosition() error {
// Load the root node. It has already been validated, during initialize.
if err := r.load(r.rootNodeCOffset, r.rootNodeArity); err != nil {
return err
}
// Walk the branch nodes until we find the leaf node containing the
// seekPosition.
cBias := int64(0)
dBias := int64(0)
for {
i := r.currNode.findChunkContaining(r.seekPosition, dBias)
if r.currNode.isLeaf(i) {
r.nextChunk = int32(i)
r.currNodeCBias = cBias
r.currNodeDBias = dBias
return nil
}
parentCodec := r.currNode.codec()
parentCodecHasMixBit := r.currNode.codecHasMixBit()
parentVersion := r.currNode.version()
parentCOffMax := cBias + r.currNode.cPtrMax()
childCOffset := r.currNode.cOff(i, cBias)
childCBias := cBias
if sTag := int(r.currNode.sTag(i)); sTag < r.currNode.arity() {
childCBias = r.currNode.cOff(sTag, cBias)
}
childDBias := r.currNode.dOff(i, dBias)
childDSize := r.currNode.dSize(i)
if err := r.loadAndValidate(childCOffset,
parentCodec, parentCodecHasMixBit, parentVersion, parentCOffMax,
childCBias, childDSize); err != nil {
return err
}
cBias = childCBias
dBias = childDBias
}
}