| // Copyright 2019 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // ---------------- |
| |
| //go:generate go run gen.go |
| |
| /* |
| ractool manipulates Random Access Compression (RAC) files. |
| |
| See the RAC specification for more details: |
| https://github.com/google/wuffs/blob/master/doc/spec/rac-spec.md |
| |
| Usage: |
| |
| ractool [flags] [input_filename] |
| |
| If no input_filename is given, stdin is used. Either way, output is written to |
| stdout. |
| |
| The flags should include exactly one of -decode or -encode. |
| |
| By default, a RAC file's chunks are decoded in parallel, using more total CPU |
| time to substantially reduce the real (wall clock) time taken. Batch (instead |
| of interactive) processing of many RAC files may want to pass -singlethreaded |
| to prefer minimizing total CPU time. |
| |
| When encoding, the input is partitioned into chunks and each chunk is |
| compressed independently. You can specify the target chunk size in terms of |
| either its compressed size or decompressed size. By default (if both |
| -cchunksize and -dchunksize are zero), a 64KiB -dchunksize is used. |
| |
| You can also specify a -cpagesize, which is similar to but not exactly the same |
| concept as alignment. If non-zero, padding is inserted into the output to |
| minimize the number of pages that each chunk occupies. Look for "CPageSize" in |
| the "package rac" documentation for more details: |
| https://godoc.org/github.com/google/wuffs/lib/rac |
| |
| A RAC file consists of an index and the chunks. The index may be either at the |
| start or at the end of the file. Placing it at the start results in slightly |
| smaller and slightly more efficient RAC files, but the encoding process needs |
| more memory or temporary disk space. |
| |
| Examples: |
| |
| ractool -decode foo.rac | sha256sum |
| ractool -decode -drange=400:500 foo.rac |
| ractool -encode foo.dat > foo.rac |
| ractool -encode -codec=zlib -dchunksize=256k foo.dat > foo.rac |
| |
| The "400:500" flag value means the 100 bytes ranging from a DSpace offset |
| (offset in terms of decompressed bytes, not compressed bytes) of 400 |
| (inclusive) to 500 (exclusive). Either or both bounds may be omitted, similar |
| to Go slice syntax. A "400:" flag value would mean ranging from 400 (inclusive) |
| to the end of the decompressed file. |
| |
| The "256k" flag value means 256 kibibytes (262144 bytes). Similarly, "1m" and |
| "1M" both mean 1 mebibyte (1048576 bytes). |
| |
| General Flags: |
| |
| -decode |
| whether to decode the input |
| -encode |
| whether to encode the input |
| |
| Decode-Related Flags: |
| |
| -drange |
| the "i:j" range to decompress, ":8" means the first 8 bytes |
| -singlethreaded |
| whether to decode on a single execution thread |
| |
| Encode-Related Flags: |
| |
| -cchunksize |
| the chunk size (in CSpace) |
| -codec |
| the compression codec (default "zlib") |
| -cpagesize |
| the page size (in CSpace) |
| -dchunksize |
| the chunk size (in DSpace) |
| -indexlocation |
| the index location, "start" or "end" (default "start") |
| -resources |
| comma-separated list of resource files, such as shared dictionaries |
| |
| Installation: |
| |
| Like any other implemented-in-Go program, to install the ractool program: |
| |
| go get github.com/google/wuffs/cmd/ractool |
| |
| Extended Example: |
| |
| -------- |
| $ # Fetch and unzip the enwik8 test file, a sample of Wikipedia. |
| $ wget http://mattmahoney.net/dc/enwik8.zip |
| $ unzip enwik8.zip |
| |
| $ # Create a shared dictionary. The dictionary_generator program |
| $ # comes from https://github.com/google/brotli |
| $ dictionary_generator --chunk_len=64k dict.dat enwik8 |
| |
| $ # RAC-encode it twice, with and without that shared dictionary. |
| $ ractool -encode -resources=dict.dat enwik8 > shared.rac |
| $ ractool -encode enwik8 > vanilla.rac |
| |
| $ # The size overhead (compared to the .zip) is about 2.4% or 4.8%, |
| $ # depending on whether we used a shared dictionary. |
| $ ls -l |
| total 207012 |
| -rw-r--r-- 1 tao tao 16384 Aug 9 19:12 dict.dat |
| -rw-r--r-- 1 tao tao 100000000 Jun 2 2011 enwik8 |
| -rw-r--r-- 1 tao tao 36445475 Sep 2 2011 enwik8.zip |
| -rw-r--r-- 1 tao tao 37320896 Aug 9 19:16 shared.rac |
| -rw-r--r-- 1 tao tao 38185178 Aug 9 19:17 vanilla.rac |
| |
| $ # Check that the decompressed forms all match. |
| $ cat enwik8 | sha256sum |
| 2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8 - |
| $ unzip -p enwik8.zip | sha256sum |
| 2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8 - |
| $ ractool -decode shared.rac | sha256sum |
| 2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8 - |
| $ ractool -decode vanilla.rac | sha256sum |
| 2b49720ec4d78c3c9fabaee6e4179a5e997302b3a70029f30f2d582218c024a8 - |
| |
| $ # Compare how long it takes to produce 8 bytes from the middle of |
| $ # the decompressed file, which happens to be the word "Business". |
| $ time unzip -p enwik8.zip | dd if=/dev/stdin status=none \ |
| > iflag=skip_bytes,count_bytes skip=50000000 count=8 |
| Business |
| real 0m0.392s |
| user 0m0.407s |
| sys 0m0.118s |
| $ time ractool -decode -drange=50000000:50000008 shared.rac |
| Business |
| real 0m0.003s |
| user 0m0.004s |
| sys 0m0.000s |
| |
| $ # A RAC file's chunks can be decoded in parallel, unlike ZIP, |
| $ # substantially reducing the real (wall clock) time taken even |
| $ # though both of these files use DEFLATE (RFC 1951) compression. |
| $ time unzip -p enwik8.zip > /dev/null |
| real 0m0.737s |
| user 0m0.713s |
| sys 0m0.025s |
| $ time ractool -decode shared.rac > /dev/null |
| real 0m0.095s |
| user 0m1.316s |
| sys 0m0.069s |
| -------- |
| */ |
| package main |
| |
| import ( |
| "bytes" |
| "errors" |
| "flag" |
| "fmt" |
| "io" |
| "io/ioutil" |
| "os" |
| "runtime" |
| "strconv" |
| "strings" |
| |
| "github.com/google/wuffs/lib/rac" |
| "github.com/google/wuffs/lib/raczlib" |
| ) |
| |
| // TODO: a flag to use a disk-backed (not memory-backed) TempFile. |
| |
// General flags. main1 requires exactly one of these two to be set.
var (
	decodeFlag = flag.Bool("decode", false, "whether to decode the input")
	encodeFlag = flag.Bool("encode", false, "whether to encode the input")
)

// Decode-related flags.
var (
	// drangeFlag is parsed by parseRange; the default ":" leaves both bounds
	// unspecified.
	drangeFlag = flag.String("drange", ":",
		"the \"i:j\" range to decompress, \":8\" means the first 8 bytes")
	singlethreadedFlag = flag.Bool("singlethreaded", false,
		"whether to decode on a single execution thread")
)

// Encode-related flags. The size flags are strings (not integers) because
// they are parsed by parseNumber, which accepts suffixes such as "64k".
var (
	codecFlag         = flag.String("codec", "zlib", "the compression codec")
	cpagesizeFlag     = flag.String("cpagesize", "0", "the page size (in CSpace)")
	cchunksizeFlag    = flag.String("cchunksize", "0", "the chunk size (in CSpace)")
	dchunksizeFlag    = flag.String("dchunksize", "0", "the chunk size (in DSpace)")
	indexlocationFlag = flag.String("indexlocation", "start",
		"the index location, \"start\" or \"end\"")
	resourcesFlag = flag.String("resources", "",
		"comma-separated list of resource files, such as shared dictionaries")
)
| |
| func usage() { |
| os.Stderr.WriteString(usageStr) |
| } |
| |
| func main() { |
| if err := main1(); err != nil { |
| os.Stderr.WriteString(err.Error() + "\n") |
| os.Exit(1) |
| } |
| } |
| |
| func main1() error { |
| flag.Usage = usage |
| flag.Parse() |
| |
| inFile := os.Stdin |
| switch flag.NArg() { |
| case 0: |
| // No-op. |
| case 1: |
| f, err := os.Open(flag.Arg(0)) |
| if err != nil { |
| return err |
| } |
| defer f.Close() |
| inFile = f |
| default: |
| return errors.New("too many filenames; the maximum is one") |
| } |
| |
| if *decodeFlag && !*encodeFlag { |
| return decode(inFile) |
| } |
| if *encodeFlag && !*decodeFlag { |
| return encode(inFile) |
| } |
| return errors.New("must specify exactly one of -decode or -encode") |
| } |
| |
// parseNumber converts strings like "3", "4k" and "0x50" to the integers 3,
// 4096 and 80. It returns a negative value if and only if an error is
// encountered.
//
// The digits are parsed by strconv.ParseInt with base 0, so the usual Go
// prefixes (such as "0x") are accepted. An optional one-letter suffix, in
// either case, scales the value by a power of two:
//
//	k = 1<<10, m = 1<<20, g = 1<<30, t = 1<<40, p = 1<<50, e = 1<<60
//
// A string that is already a complete number is taken at face value, so that
// a hexadecimal literal ending in the digit 'e' (such as "0x1e", which is 30)
// is not misread as "0x1" followed by the 'e' suffix.
//
// Negative numbers, malformed input and values that would overflow an int64
// are all errors.
func parseNumber(s string) int64 {
	if s == "" {
		return -1
	}

	// Try the whole string first. This is what keeps 'e' and 'E' usable as
	// hexadecimal digits: only if the string is not a number by itself is its
	// last byte considered to be a suffix.
	if i, err := strconv.ParseInt(s, 0, 64); err == nil {
		if i < 0 {
			return -1
		}
		return i
	}

	n := len(s) - 1
	shift := uint32(0)
	switch s[n] {
	case 'k', 'K':
		shift = 10
	case 'm', 'M':
		shift = 20
	case 'g', 'G':
		shift = 30
	case 't', 'T':
		shift = 40
	case 'p', 'P':
		shift = 50
	case 'e', 'E':
		shift = 60
	default:
		// No suffix, and the whole string has already failed to parse.
		return -1
	}

	i, err := strconv.ParseInt(s[:n], 0, 64)
	if (err != nil) || (i < 0) {
		return -1
	}
	// Reject values whose scaled form would not fit in an int64.
	const int64Max = (1 << 63) - 1
	if i > (int64Max >> shift) {
		return -1
	}
	return i << shift
}
| |
| // parseRange parses a string like "1:23", returning i=1 and j=23. Either or |
| // both numbers can be missing, in which case i and/or j will be negative, and |
| // it is up to the caller to interpret that placeholder value meaningfully. |
| func parseRange(s string) (i int64, j int64, ok bool) { |
| n := strings.IndexByte(s, ':') |
| if n < 0 { |
| return 0, 0, false |
| } |
| |
| if n == 0 { |
| i = -1 |
| } else if i = parseNumber(s[:n]); i < 0 { |
| return 0, 0, false |
| } |
| |
| if n+1 >= len(s) { |
| j = -1 |
| } else if j = parseNumber(s[n+1:]); j < 0 { |
| return 0, 0, false |
| } |
| |
| if (i >= 0) && (j >= 0) && (i > j) { |
| return 0, 0, false |
| } |
| return i, j, true |
| } |
| |
| func decode(inFile *os.File) error { |
| switch *codecFlag { |
| case "zlib": |
| // No-op. |
| default: |
| return errors.New("unsupported -codec") |
| } |
| i, j, ok := parseRange(*drangeFlag) |
| if !ok { |
| return fmt.Errorf("invalid -drange") |
| } |
| |
| rs := io.ReadSeeker(inFile) |
| compressedSize, err := inFile.Seek(0, io.SeekEnd) |
| if err != nil { |
| // This seek-to-end error isn't fatal. The input might not actually be |
| // seekable, despite being an *os.File: "cat foo | ractool -decode". |
| // Instead, read all of the inFile into memory. |
| if inBytes, err := ioutil.ReadAll(inFile); err != nil { |
| return err |
| } else { |
| rs = bytes.NewReader(inBytes) |
| compressedSize = int64(len(inBytes)) |
| } |
| } |
| chunkReader := &rac.ChunkReader{ |
| ReadSeeker: rs, |
| CompressedSize: compressedSize, |
| } |
| decompressedSize, err := chunkReader.DecompressedSize() |
| if err != nil { |
| return err |
| } |
| if i < 0 { |
| i = 0 |
| } |
| if (j < 0) || (j > decompressedSize) { |
| j = decompressedSize |
| } |
| if i >= j { |
| return nil |
| } |
| |
| if *singlethreadedFlag { |
| racReader := mustMakeRACReader(rs, compressedSize) |
| return decodeSingleThreaded(os.Stdout, rac.Range{i, j}, racReader) |
| } |
| return decodeMultiThreaded(os.Stdout, rac.Range{i, j}, rs, compressedSize) |
| } |
| |
| func decodeMultiThreaded(dst io.Writer, overallDRange rac.Range, rs io.ReadSeeker, compressedSize int64) error { |
| numWorkers := runtime.NumCPU() |
| // After 16 workers, we seem to hit diminishing returns. |
| if numWorkers > 16 { |
| numWorkers = 16 |
| } |
| |
| // Set up re-usable buffers to hold decoded bytes. The number of buffers is |
| // arbitrary, but is larger than the number of workers because individual |
| // pieces of work (the decodeWork type) may complete in a different order |
| // than they need to be written to dst. Having spare buffers means fewer |
| // idle workers. |
| bufc := make(chan *bytes.Buffer, 2*numWorkers) |
| for i := 0; i < cap(bufc); i++ { |
| bufc <- &bytes.Buffer{} |
| } |
| |
| // Set up the workers, fed incomplete decodeWork's by issueDecodeWork. |
| reqc := make(chan decodeWork, numWorkers) |
| resc := make(chan decodeWork, numWorkers) |
| go issueDecodeWork(reqc, bufc, overallDRange, rs, compressedSize) |
| for i := 0; i < numWorkers; i++ { |
| go decodeWorker(resc, reqc, mustMakeRACReader(rs, compressedSize)) |
| } |
| |
| // m holds completed decodeWork's, which may arrive out of order. The next |
| // (in ascending DRange order) decodeWork starts at nextDRange0. |
| m := map[int64]*bytes.Buffer{} |
| nextDRange0 := overallDRange[0] |
| |
| // Handle completed decodeWork's. |
| for work := range resc { |
| // Add the next arrival to m (if non-empty), then pop all of m's |
| // entries that are next in ascending DRange order, returning any used |
| // buffers to bufc to be recycled. |
| if work.buf == nil { |
| // No-op. |
| } else if work.buf.Len() == 0 { |
| bufc <- work.buf |
| } else { |
| m[work.dRange[0]] = work.buf |
| for { |
| buf, ok := m[nextDRange0] |
| if !ok { |
| break |
| } |
| delete(m, nextDRange0) |
| n, err := dst.Write(buf.Bytes()) |
| if err != nil { |
| return err |
| } |
| nextDRange0 += int64(n) |
| bufc <- buf |
| } |
| } |
| |
| if work.err == errEndOfWork { |
| numWorkers-- |
| if numWorkers != 0 { |
| continue |
| } |
| return nil |
| } else if work.err != nil { |
| return work.err |
| } |
| |
| // This shouldn't happen, but if it does, print a specific error |
| // message instead of the more general "deadlock" message. |
| if len(m) == cap(bufc) { |
| return errors.New("internal error: all workers idle but the next chunk is not found") |
| } |
| } |
| panic("unreachable") |
| } |
| |
// errEndOfWork is the sentinel that each decodeWorker sends, in an otherwise
// empty decodeWork, as its final message. decodeMultiThreaded counts these to
// learn when every worker has finished.
var errEndOfWork = errors.New("end of work")
| |
| // decodeWork is a unit of work for multi-threaded decoding. Workers receive |
| // empty buffers and send filled buffers. |
| type decodeWork struct { |
| dRange rac.Range |
| buf *bytes.Buffer |
| err error |
| } |
| |
| func decodeWorker(resc chan<- decodeWork, reqc <-chan decodeWork, racReader *rac.Reader) { |
| for work := range reqc { |
| if work.err == nil { |
| work.err = decodeSingleThreaded(work.buf, work.dRange, racReader) |
| } |
| resc <- work |
| if work.err != nil { |
| break |
| } |
| } |
| resc <- decodeWork{err: errEndOfWork} |
| } |
| |
| func issueDecodeWork(reqc chan<- decodeWork, bufc <-chan *bytes.Buffer, overallDRange rac.Range, rs io.ReadSeeker, compressedSize int64) { |
| defer close(reqc) |
| chunkReader := &rac.ChunkReader{ |
| ReadSeeker: rs, |
| CompressedSize: compressedSize, |
| } |
| if err := chunkReader.SeekToChunkContaining(overallDRange[0]); err != nil { |
| reqc <- decodeWork{err: err} |
| return |
| } |
| for { |
| chunk, err := chunkReader.NextChunk() |
| if err == io.EOF { |
| return |
| } else if err != nil { |
| reqc <- decodeWork{err: err} |
| return |
| } |
| if dr := chunk.DRange.Intersect(overallDRange); !dr.Empty() { |
| buf := <-bufc |
| buf.Reset() |
| reqc <- decodeWork{dRange: dr, buf: buf} |
| } |
| if chunk.DRange[1] >= overallDRange[1] { |
| return |
| } |
| } |
| } |
| |
| func decodeSingleThreaded(dst io.Writer, dRange rac.Range, racReader *rac.Reader) error { |
| if _, err := racReader.Seek(dRange[0], io.SeekStart); err != nil { |
| return err |
| } |
| _, err := io.Copy(dst, &io.LimitedReader{ |
| R: racReader, |
| N: dRange.Size(), |
| }) |
| return err |
| } |
| |
| func mustMakeRACReader(rs io.ReadSeeker, compressedSize int64) *rac.Reader { |
| r := &rac.Reader{ |
| ReadSeeker: rs, |
| CompressedSize: compressedSize, |
| } |
| switch *codecFlag { |
| case "zlib": |
| r.CodecReaders = []rac.CodecReader{&raczlib.CodecReader{}} |
| default: |
| panic("unreachable") |
| } |
| return r |
| } |
| |
| func encode(r io.Reader) error { |
| indexLocation := rac.IndexLocation(0) |
| switch *indexlocationFlag { |
| case "start": |
| indexLocation = rac.IndexLocationAtStart |
| case "end": |
| indexLocation = rac.IndexLocationAtEnd |
| default: |
| return errors.New("invalid -indexlocation") |
| } |
| |
| cchunksize := parseNumber(*cchunksizeFlag) |
| if cchunksize < 0 { |
| return errors.New("invalid -cchunksize") |
| } |
| cpagesize := parseNumber(*cpagesizeFlag) |
| if cpagesize < 0 { |
| return errors.New("invalid -cpagesize") |
| } |
| dchunksize := parseNumber(*dchunksizeFlag) |
| if dchunksize < 0 { |
| return errors.New("invalid -dchunksize") |
| } |
| |
| if (cchunksize != 0) && (dchunksize != 0) { |
| return errors.New("must specify none or one of -cchunksize or -dchunksize") |
| } else if (cchunksize == 0) && (dchunksize == 0) { |
| dchunksize = 65536 // 64 KiB. |
| } |
| |
| w := &rac.Writer{ |
| Writer: os.Stdout, |
| IndexLocation: indexLocation, |
| TempFile: &bytes.Buffer{}, |
| CPageSize: uint64(cpagesize), |
| CChunkSize: uint64(cchunksize), |
| DChunkSize: uint64(dchunksize), |
| } |
| switch *codecFlag { |
| case "zlib": |
| w.CodecWriter = &raczlib.CodecWriter{} |
| default: |
| return errors.New("unsupported -codec") |
| } |
| |
| if *resourcesFlag != "" { |
| for _, filename := range strings.Split(*resourcesFlag, ",") { |
| resource, err := ioutil.ReadFile(filename) |
| if err != nil { |
| return err |
| } |
| w.ResourcesData = append(w.ResourcesData, resource) |
| } |
| } |
| |
| if _, err := io.Copy(w, r); err != nil { |
| return err |
| } |
| return w.Close() |
| } |