| #!python3 |
| """Program to dump contents of Brotli compressed files showing the compression format. |
| Jurjen N.E. Bos, 2016. |
| I found the following issues with the Brotli format: |
| - The distance alphabet has size 16+(48<<POSTFIX), |
| but the last symbols are useless. |
| It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter. |
| - The block type code is useless if NBLTYPES==2, you would only need 1 symbol |
| anyway, so why don't you just switch to "the other" type? |
| """ |
| import struct |
| from operator import itemgetter, methodcaller |
| from itertools import accumulate, repeat |
| from collections import defaultdict, deque |
| from functools import partial |
| |
| class InvalidStream(Exception): pass |
| #lookup table |
| L, I, D = "literal", "insert©", "distance" |
| pL, pI, pD = 'P'+L, 'P'+I, 'P'+D |
| |
| def outputCharFormatter(c): |
| """Show character in readable format |
| """ |
| #TODO 2: allow hex only output |
| if 32<c<127: return chr(c) |
| elif c==10: return '\\n' |
| elif c==13: return '\\r' |
| elif c==32: return '" "' |
| else: return '\\x{:02x}'.format(c) |
| |
| def outputFormatter(s): |
| """Show string or char. |
| """ |
| result = '' |
| def formatSubString(s): |
| for c in s: |
| if c==32: yield ' ' |
| else: yield outputCharFormatter(c) |
| if len(result)<200: return ''.join(formatSubString(s)) |
| else: |
| return ''.join(formatSubString(s[:100]))+'...'+ \ |
| ''.join(formatSubString(s[-100:])) |
| |
| |
| class BitStream: |
| """Represent a bytes object. Can read bits and prefix codes the way |
| Brotli does. |
| """ |
| def __init__(self, byteString): |
| self.data = byteString |
| #position in bits: byte pos is pos>>3, bit pos is pos&7 |
| self.pos = 0 |
| |
| def __repr__(self): |
| """Representation |
| >>> olleke |
| BitStream(pos=0:0) |
| """ |
| return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7) |
| |
| def read(self, n): |
| """Read n bits from the stream and return as an integer. |
| Produces zero bits beyond the stream. |
| >>> olleke.data[0]==27 |
| True |
| >>> olleke.read(5) |
| 27 |
| |
| >>> olleke |
| BitStream(pos=0:5) |
| """ |
| value = self.peek(n) |
| self.pos += n |
| if self.pos>len(self.data)*8: |
| raise ValueError('Read past end of stream') |
| return value |
| |
| def peek(self, n): |
| """Peek an n bit integer from the stream without updating the pointer. |
| It is not an error to read beyond the end of the stream. |
| >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803 |
| True |
| >>> olleke.peek(15) |
| 11803 |
| >>> hex(olleke.peek(32)) |
| '0x2e1b' |
| """ |
| #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3] |
| #convert to int: int.from_bytes(..., 'little') |
| #shift out the bits from the first byte: >>(self.pos&7) |
| #mask unwanted bits: & (1<<n)-1 |
| return int.from_bytes( |
| self.data[self.pos>>3:self.pos+n+7>>3], |
| 'little')>>(self.pos&7) & (1<<n)-1 |
| |
| def readBytes(self, n): |
| """Read n bytes from the stream on a byte boundary. |
| """ |
| if self.pos&7: raise ValueError('readBytes: need byte boundary') |
| result = self.data[self.pos>>3:(self.pos>>3)+n] |
| self.pos += 8*n |
| return result |
| |
| #-----------------------Symbol------------------------------------------- |
| class Symbol: |
| """A symbol in a code. |
| Refers back to the code that contains it. |
| Index is the place in the alphabet of the symbol. |
| """ |
| __slots__ = 'code', 'index' |
| def __init__(self, code, value): |
| self.code = code |
| self.index = value |
| |
| def __repr__(self): |
| return 'Symbol({}, {})'.format(self.code.name, self.index) |
| |
| def __len__(self): |
| """Number of bits in the prefix notation of this symbol |
| """ |
| return self.code.length(self.index) |
| |
| def __int__(self): |
| return self.index |
| |
| #these routines call equivalent routine in Code class |
| def bitPattern(self): |
| """Value of the symbol in the stream |
| """ |
| return self.code.bitPattern(self.index) |
| |
| def extraBits(self): |
| """Number of extra bits to read for this symbol |
| """ |
| return self.code.extraBits(self.index) |
| |
| def __str__(self): |
| """Short descriptor of the symbol without extra bits. |
| """ |
| return self.code.mnemonic(self.index) |
| |
| #requiring optional extra bits, if self.code supports them |
| def value(self, extra=None): |
| """The value used for processing. Can be a tuple. |
| with optional extra bits |
| """ |
| if isinstance(self.code, WithExtra): |
| if not 0<=extra<1<<self.extraBits(): |
| raise ValueError("value: extra value doesn't fit in extraBits") |
| return self.code.value(self.index, extra) |
| if extra is not None: |
| raise ValueError('value: no extra bits for this code') |
| return self.code.value(self.index) |
| |
| def explanation(self, extra=None): |
| """Long explanation of the value from the numeric value |
| with optional extra bits |
| Used by Layout.verboseRead when printing the value |
| """ |
| if isinstance(self.code, WithExtra): |
| return self.code.callback(self, extra) |
| return self.code.callback(self) |
| |
| #========================Code definitions================================== |
| class RangeDecoder: |
| """A decoder for the Code class that assumes the symbols |
| are encoded consecutively in binary. |
| It all depends on the "alphabetSize" property. |
| The range runs from 0 to alphabetSize-1. |
| This is the default decoder. |
| """ |
| def __init__(self, *, alphabetSize=None, bitLength=None, **args): |
| if bitLength is not None: alphabetSize = 1<<bitLength |
| if alphabetSize is not None: |
| self.alphabetSize = alphabetSize |
| self.maxLength = (alphabetSize-1).bit_length() |
| |
| def __len__(self): |
| return self.alphabetSize |
| |
| def __iter__(self): |
| """Produce all symbols. |
| """ |
| return map(partial(Symbol, self), range(len(self))) |
| |
| def __getitem__(self, index): |
| if index>=self.alphabetSize: raise ValueError('index out of range') |
| return Symbol(self, index) |
| |
| def bitPattern(self, index): |
| return '{:0{}b}'.format(index, self.maxLength) |
| |
| def length(self, index): |
| """Encoding length of given symbol. |
| Does not depend on index in this case. |
| """ |
| return self.maxLength |
| |
| def decodePeek(self, data): |
| """Find which symbol index matches the given data (from peek, as a number) |
| and return the number of bits decoded. |
| Can also be used to figure out length of a symbol. |
| """ |
| return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1) |
| |
| class PrefixDecoder: |
| """A decoder for the Code class that uses a prefix code. |
| The code is determined by encoding: |
| encode[p] gives the index corresponding to bit pattern p. |
| Used setDecode(decodeTable) to switch the decoder from the default |
| to a prefix decoder, or pass decodeTable at init. |
| You can also use setLength(lengthTable) |
| to define the encoding from the lengths. |
| The set of symbol values does not need to be consecutive. |
| """ |
| def __init__(self, *, decodeTable=None, **args): |
| if decodeTable is not None: self.setDecode(decodeTable) |
| |
| def __len__(self): |
| return len(self.decodeTable) |
| |
| def __iter__(self): |
| def revBits(index): |
| return self.bitPattern(index)[::-1] |
| return ( |
| Symbol(self, index) |
| for index in sorted(self.decodeTable.values(), key=revBits) |
| ) |
| |
| def __getitem__(self, index): |
| if index not in self.lengthTable: |
| raise ValueError('No symbol {}[{}]'.format( |
| self.__class__.__name__, index)) |
| return Symbol(self, index) |
| |
| def bitPattern(self, index): |
| bits = next(b for (b,s) in self.decodeTable.items() if s==index) |
| return '{:0{}b}'.format(bits, self.length(index)) |
| |
| def length(self, index): |
| """Encoding length of given symbol. |
| """ |
| return self.lengthTable[index] |
| |
| def decodePeek(self, data): |
| """Find which symbol index matches the given data (from peek, as a number) |
| and return the number of bits decoded. |
| Can also be used to figure out length of a symbol. |
| """ |
| #do binary search for word length |
| #invariant: lo<=length<=hi |
| lo, hi = self.minLength, self.maxLength |
| while lo<=hi: |
| mid = lo+hi>>1 |
| #note lo<=mid<hi at this point |
| mask = (1<<mid)-1 |
| #lets see what happens if we guess length is mid |
| try: index = self.decodeTable[data&mask] |
| except KeyError: |
| #too many bits specified, reduce estimated length |
| hi = mid-1 |
| continue |
| #we found a symbol, but there could be a longer match |
| symbolLength = self.lengthTable[index] |
| if symbolLength<=mid: |
| #all bits match, symbol must be right |
| return symbolLength, Symbol(self, index) |
| #there must be more bits to match |
| lo = mid+1 |
| return lo, Symbol(self, index) |
| |
| #routine to set up the tables |
| def setDecode(self, decodeTable): |
| """Store decodeTable, |
| and compute lengthTable, minLength, maxLength from encodings. |
| """ |
| self.decodeTable = decodeTable |
| #set of symbols with unknown length |
| todo = set(decodeTable) |
| #bit size under investigation |
| maskLength = 0 |
| lengthTable = {} |
| while todo: |
| mask = (1<<maskLength)-1 |
| #split the encodings that we didn't find yet using b bits |
| splitSymbols = defaultdict(list) |
| for s in todo: splitSymbols[s&mask].append(s) |
| #unique encodings have a length of maskLength bits |
| #set length, and remove from todo list |
| for s,subset in splitSymbols.items(): |
| if len(subset)==1: |
| lengthTable[self.decodeTable[s]] = maskLength |
| todo.remove(s) |
| #now investigate with longer mask |
| maskLength +=1 |
| #save result |
| self.lengthTable = lengthTable |
| self.minLength = min(lengthTable.values()) |
| self.maxLength = max(lengthTable.values()) |
| self.switchToPrefix() |
| |
| def setLength(self, lengthTable): |
| """Given the bit pattern lengths for symbols given in lengthTable, |
| set decodeTable, minLength, maxLength |
| """ |
| self.lengthTable = lengthTable |
| self.minLength = min(lengthTable.values()) |
| self.maxLength = max(lengthTable.values()) |
| #compute the backwards codes first; then reverse them |
| #compute (backwards) first code for every separate lengths |
| nextCodes = [] |
| #build codes for each length, from right to left |
| code = 0 |
| for bits in range(self.maxLength+1): |
| code <<= 1 |
| nextCodes.append(code) |
| code += sum(x==bits for x in lengthTable.values()) |
| self.decodeTable = {} |
| #count codes for each length, and store reversed in the table |
| for symbol in sorted(lengthTable): |
| bits = lengthTable[symbol] |
| bitpattern = '{:0{}b}'.format(nextCodes[bits], bits) |
| self.decodeTable[int(bitpattern[::-1], 2)] = symbol |
| nextCodes[bits] += 1 |
| self.switchToPrefix() |
| |
| def switchToPrefix(self): |
| """This routine makes sure the prefix decoder is activated. |
| """ |
| self.mode = PrefixDecoder |
| |
| class Code(RangeDecoder, PrefixDecoder): |
| """An alphabet of symbols, that can be read from a stream. |
| If you use setDecode or setLength, you have a prefix code, |
| otherwise you have a range code. |
| Features: |
| code[index] produces symbol with given index |
| value(index): value of symbol |
| mnemonic(index): short description of symbol |
| explanation(index): show meaning of symbol, shown in Layout.verboseRead |
| iter(code): produce all symbols in some order |
| name: show as context in Layout.verboseRead |
| """ |
| name = '?' |
| #callback is a function that gets the symbol and the extra bits |
| #default callback calls explanation |
| def __init__(self, name=None, *, callback=None, description='', **args): |
| """Don't forget to set either alphabetSize or decodeTable |
| """ |
| #set name when provided, otherwise take class variable |
| if name is not None: self.name = name |
| if callback is not None: self.callback = callback |
| self.description = description |
| #mode switch |
| if 'bitLength' in args or 'alphabetSize' in args: |
| self.mode = RangeDecoder |
| RangeDecoder.__init__(self, **args) |
| elif 'decodeTable' in args: |
| self.mode = PrefixDecoder |
| PrefixDecoder.__init__(self, **args) |
| else: |
| super().__init__(**args) |
| |
| def __repr__(self): |
| return self.__class__.__name__+' '+self.name |
| |
| #the routines that get switched between RangeDecoder and PrefixDecoder |
| def __len__(self): return self.mode.__len__(self) |
| def __iter__(self): return self.mode.__iter__(self) |
| def __getitem__(self, index): return self.mode.__getitem__(self, index) |
| def bitPattern(self, index): return self.mode.bitPattern(self, index) |
| def length(self, index): return self.mode.length(self, index) |
| def decodePeek(self, data): return self.mode.decodePeek(self, data) |
| #general routines |
| def value(self, index, extra=None): |
| """Get value of symbol for computations. |
| Override where needed. |
| """ |
| if extra is not None: |
| raise ValueError('value: no extra for this symbol') |
| return index |
| |
| def mnemonic(self, index): |
| """Give mnemonic of symbol. |
| Override where needed. |
| """ |
| return str(self.value(index)) |
| |
| def callback(self, symbol): |
| return self.explanation(symbol.index) |
| |
| def explanation(self, index): |
| """Long explanation of the value from the numeric value |
| This is a default routine. |
| You can customize in three ways: |
| - set description to add some text |
| - override to get more control |
| - set callback to make it dependent on you local variables |
| """ |
| value = self.value(index) |
| return '{0}{1}: {2}'.format( |
| self.description and self.description+': ', |
| self.bitPattern(index), |
| value, |
| ) |
| |
| def extraBits(self, index): |
| return 0 |
| |
| #Routines that use the decode interface |
| def showCode(self, width=80): |
| """Show all words of the code in a nice format. |
| """ |
| #make table of all symbols with binary strings |
| symbolStrings = [ |
| (self.bitPattern(s.index), self.mnemonic(s.index)) |
| for s in self |
| ] |
| #determine column widths the way Lisp programmers do it |
| leftColWidth, rightColWidth = map(max, map( |
| map, |
| repeat(len), |
| zip(*symbolStrings) |
| )) |
| colwidth = leftColWidth+rightColWidth |
| columns = 81//(colwidth+2) |
| rows = -(-len(symbolStrings)//columns) |
| def justify(bs): |
| b,s = bs |
| return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth) |
| for i in range(rows): |
| print(' '.join(map(justify, symbolStrings[i::rows])).rstrip()) |
| |
| def readTuple(self, stream): |
| """Read symbol from stream. Returns symbol, length. |
| """ |
| length, symbol = self.decodePeek(stream.peek(self.maxLength)) |
| stream.pos += length |
| return length, symbol |
| |
| def readTupleAndExtra(self, stream): |
| return self.readTuple(stream)+(0, None) |
| |
| class WithExtra(Code): |
| """Extension for Code so that symbol may have extra bits associated. |
| If you supply an extraTable, you can use extraBits |
| You can define an extraTable, |
| which allows to call extraBits to get the number of extraBits. |
| Otherwise, you can supply extraBits yourself. |
| Routine readTupleAndExtra now reads the extra bits too. |
| Value probably needs to be overridden; see Enumerator. |
| Note: this does not give you an decodeTable. |
| """ |
| #redefine these if you don't want to use an extraTable |
| def extraBits(self, index): |
| """Get the number of extra bits for this symbol. |
| """ |
| return self.extraTable[index] |
| |
| def mnemonic(self, index): |
| """This value must be independent of extra. |
| """ |
| return str(index) |
| |
| def readTupleAndExtra(self, stream): |
| """Read symbol and extrabits from stream. |
| Returns symbol length, symbol, extraBits, extra |
| >>> olleke.pos = 6 |
| >>> MetablockLengthAlphabet().readTupleAndExtra(olleke) |
| (2, Symbol(MLEN, 4), 16, 46) |
| """ |
| length, symbol = self.decodePeek(stream.peek(self.maxLength)) |
| stream.pos += length |
| extraBits = self.extraBits(symbol.index) |
| return length, symbol, extraBits, stream.read(extraBits) |
| |
| def explanation(self, index, extra=None): |
| """Expanded version of Code.explanation supporting extra bits. |
| If you don't supply extra, it is not mentioned. |
| """ |
| extraBits = 0 if extra is None else self.extraBits(index) |
| if not hasattr(self, 'extraTable'): |
| formatString = '{0}{3}' |
| lo = hi = value = self.value(index, extra) |
| elif extraBits==0: |
| formatString = '{0}{2}: {3}' |
| lo, hi = self.span(index) |
| value = lo |
| else: |
| formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}' |
| lo, hi = self.span(index) |
| value = lo+extra |
| return formatString.format( |
| self.description and self.description+': ', |
| 'x'*extraBits, |
| self.bitPattern(index), |
| lo, hi, |
| extra, |
| value, |
| ) |
| |
| def callback(self, symbol, extra): |
| return self.explanation(symbol.index, extra) |
| |
| class BoolCode(Code): |
| """Same as Code(bitLength=1), but shows a boolean. |
| """ |
| def __init__(self, name=None, **args): |
| super().__init__(name, bitLength=1, **args) |
| |
| def value(self, index, extra=None): |
| return bool(super().value(index, extra)) |
| |
| class Enumerator(WithExtra): |
| """Code that is defined by the ExtraTable. |
| extraTable is a class variable that contains |
| the extraBits of the symbols from 0 |
| value0 contains the value of symbol 0 |
| encodings is not neccessary, but allowed. |
| Note: place for FixedCode to make sure extraBits works |
| """ |
| def __init__(self, name=None, **args): |
| #if there is no decodeTable to determine length, compute it ourselves |
| if 'decodeTable' not in args: |
| args['alphabetSize'] = len(self.extraTable) |
| super().__init__(name, **args) |
| |
| def __len__(self): |
| return len(self.extraTable) |
| |
| def __getitem__(self, index): |
| """Faster than PrefixDecoder |
| """ |
| if index>=len(self.extraTable): |
| raise ValueError("No symbol {}[{}]".format( |
| self.__class__.__name__, index)) |
| return Symbol(self, index) |
| |
| def value(self, index, extra): |
| """Override if you don't define value0 and extraTable |
| """ |
| lower, upper = self.span(index) |
| value = lower+(extra or 0) |
| if value>upper: |
| raise ValueError('value: extra out of range') |
| return value |
| |
| def span(self, index): |
| """Give the range of possible values in a tuple |
| Useful for mnemonic and explanation |
| """ |
| lower = self.value0+sum(1<<x for x in self.extraTable[:index]) |
| upper = lower+(1<<self.extraTable[index]) |
| return lower, upper-1 |
| |
| #======================Code subclasses====================================== |
| #Alphabets used in the metablock header---------------------------------- |
| #For prefix codes |
| class PrefixCodeHeader(WithExtra): |
| """Header of prefix codes. |
| """ |
| def __init__(self, codename): |
| super().__init__('PFX', bitLength=2) |
| #this is the name of the code that it describes |
| self.codename = codename |
| |
| def extraBits(self, index): |
| return 2 if index==1 else 0 |
| |
| def value(self, index, extra): |
| """Returns ('Simple', #codewords) or ('Complex', HSKIP) |
| """ |
| if index==1: |
| if extra>3: |
| raise ValueError('value: extra out of range') |
| return 'Simple', extra+1 |
| if extra: |
| raise ValueError('value: extra out of range') |
| return 'Complex', index |
| |
| def explanation(self, index, extra): |
| if index==1: |
| return '{} is simple with {} code word{}'.format( |
| self.codename, extra+1, 's' if extra else '') |
| lengths = [1, 2, 3, 4, 0, 5, 17, 6] |
| return '{} is complex with lengths {}...'.format( |
| self.codename, |
| ','.join( |
| map(str, lengths[index:index+5])) |
| ) |
| |
| class TreeShapeAlhabet(BoolCode): |
| """The bit used to indicate if four word code is "deep" or "wide" |
| """ |
| name = 'SHAPE' |
| def value(self, index): |
| return [(2,2,2,2), (1,2,3,3)][index] |
| |
| def explanation(self, index): |
| return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index)) |
| |
| class LengthOfLengthAlphabet(Code): |
| """For use in decoding complex code descriptors. |
| >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('') |
| >>> print(lengthOfLengthAlphabet[2]) |
| coded with 2 bits |
| >>> len(lengthOfLengthAlphabet[0]) |
| 2 |
| >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)] |
| [2, 4, 3, 2, 2, 4] |
| >>> lengthOfLengthAlphabet.showCode() |
| 00:skipped 01:coded with 4 bits 0111:coded with 1 bits |
| 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits |
| """ |
| decodeTable = { |
| 0b00:0, 0b10:3, |
| 0b0111:1, 0b01:4, |
| 0b011:2, 0b1111:5, |
| } |
| |
| def __init__(self, name=None, **args): |
| super().__init__(name, decodeTable=self.decodeTable, **args) |
| |
| def mnemonic(self, index): |
| if index==0: return 'skipped' |
| return 'coded with {} bits'.format(index) |
| |
| def explanation(self, index, extra=None): |
| return self.description+': '+self.mnemonic(index) |
| |
| class LengthAlphabet(WithExtra): |
| """Length of symbols |
| Used during construction of a code. |
| """ |
| def __init__(self, name): |
| super().__init__(name, alphabetSize=18) |
| |
| def extraBits(self, index): |
| return {16:2, 17:3}.get(index, 0) |
| |
| def mnemonic(self, index): |
| if index==0: return 'unused' |
| elif index==16: return 'rep xx' |
| elif index==17: return 'zero xxx' |
| else: return 'len {}'.format(index) |
| |
| def explanation(self, index, extra): |
| return self.description.format(self[index], extra) |
| |
| def value(self, index, extra): |
| #the caller got the length already, so extra is enough |
| return extra |
| |
| #Stream header |
| class WindowSizeAlphabet(Code): |
| """The alphabet used for window size in the stream header. |
| >>> WindowSizeAlphabet()[10].explanation() |
| 'windowsize=(1<<10)-16=1008' |
| """ |
| decodeTable = { |
| 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22, |
| 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23, |
| 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24, |
| 0b1010001: 13, 0b0000001: 17, 0b1001: 21, |
| 0b0010001: None, |
| } |
| |
| name = 'WSIZE' |
| |
| def __init__(self, name=None): |
| super().__init__(name, decodeTable=self.decodeTable) |
| |
| def value(self, index): |
| #missing value gives index None |
| if index is None: return None |
| return (1<<index)-16 |
| |
| def explanation(self, index): |
| return 'windowsize=(1<<{})-16={}'.format( |
| index, (1<<index)-16) |
| |
| #Metablock |
| class MetablockLengthAlphabet(WithExtra): |
| """Used for the meta block length; |
| also indicates a block with no data |
| >>> metablockLengthAlphabet = MetablockLengthAlphabet() |
| >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0]) |
| Symbol(MLEN, 0) |
| 'empty' |
| >>> metablockLengthAlphabet[3] |
| Traceback (most recent call last): |
| ... |
| ValueError: No symbol MetablockLengthAlphabet[3] |
| >>> print(metablockLengthAlphabet[4]) |
| hhhh00 |
| >>> metablockLengthAlphabet[4].value(0x1000) |
| 4097 |
| >>> metablockLengthAlphabet[5].value(0x1000) |
| Traceback (most recent call last): |
| ... |
| InvalidStream: Zeros in high nibble of MLEN |
| >>> metablockLengthAlphabet[5].explanation(0x12345) |
| 'data length: 12345h+1=74566' |
| >>> metablockLengthAlphabet.showCode() |
| 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty |
| """ |
| decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6} |
| |
| name = 'MLEN' |
| def __init__(self, name=None): |
| super().__init__(name, decodeTable=self.decodeTable) |
| |
| def extraBits(self, index): |
| return index*4 |
| |
| def mnemonic(self, index): |
| if index==0: return 'empty' |
| return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) |
| |
| def value(self, index, extra): |
| extraBits = self.extraBits(index) |
| if not 0<=extra<1<<extraBits: |
| raise ValueError('value: extra out of range') |
| if index==0: return 0 |
| if index>4 and extra>>extraBits-4==0: raise InvalidStream( |
| 'Zeros in high nibble of MLEN') |
| return extra+1 |
| |
| def explanation(self, index, extra): |
| if index==0: return '11: empty block' |
| extraBits = self.extraBits(index) |
| return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1) |
| |
| |
| class ReservedAlphabet(BoolCode): |
| """The reserved bit that must be zero. |
| """ |
| name = 'RSVD' |
| def value(self, index): |
| if index: raise ValueError('Reserved bit is not zero') |
| |
| def explanation(self, index): |
| return 'Reserved (must be zero)' |
| |
| class FillerAlphabet(Code): |
| def __init__(self, *, streamPos): |
| super().__init__('SKIP', bitLength=(-streamPos)&7) |
| |
| def explanation(self, index): |
| return '{} bit{} ignored'.format( |
| self.length(index), |
| '' if self.length(index)==1 else 's', |
| ) |
| |
| class SkipLengthAlphabet(WithExtra): |
| """Used for the skip length in an empty metablock |
| >>> skipLengthAlphabet = SkipLengthAlphabet() |
| >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0]) |
| Symbol(SKIP, 0) |
| 'empty' |
| >>> skipLengthAlphabet[4] |
| Traceback (most recent call last): |
| ... |
| ValueError: index out of range |
| >>> print(skipLengthAlphabet[3]) |
| hhhhhh11 |
| >>> skipLengthAlphabet[2].value(0x1000) |
| 4097 |
| >>> skipLengthAlphabet[3].value(0x1000) |
| Traceback (most recent call last): |
| ... |
| InvalidStream: Zeros in high byte of SKIPBYTES |
| >>> skipLengthAlphabet[3].explanation(0x12345) |
| 'skip length: 12345h+1=74566' |
| >>> skipLengthAlphabet.showCode() |
| 00:empty 01:hh01 10:hhhh10 11:hhhhhh11 |
| """ |
| def __init__(self): |
| super().__init__('SKIP', bitLength=2) |
| |
| def extraBits(self, index): |
| return index*8 |
| |
| def mnemonic(self, index): |
| if index==0: return 'empty' |
| return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) |
| |
| def value(self, index, extra): |
| extraBits = self.extraBits(index) |
| if not 0<=extra<1<<extraBits: |
| raise ValueError('value: extra out of range') |
| if index==0: return 0 |
| if index>1 and extra>>extraBits-8==0: |
| raise InvalidStream('Zeros in high byte of SKIPBYTES') |
| return extra+1 |
| |
| def explanation(self, index, extra): |
| if index==0: return '00: no skip' |
| extraBits = self.extraBits(index) |
| return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1) |
| |
| |
| class TypeCountAlphabet(Enumerator): |
| """Used for giving block type counts and tree counts. |
| >>> TypeCountAlphabet(description='').showCode() |
| 0:0 0101:xx,0101 1011:xxxxx,1011 |
| 0001:0001 1101:xxxxxx,1101 0111:xxx,0111 |
| 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111 |
| """ |
| decodeTable = { |
| 0b0: 0, 0b1001: 5, |
| 0b0001: 1, 0b1011: 6, |
| 0b0011: 2, 0b1101: 7, |
| 0b0101: 3, 0b1111: 8, |
| 0b0111: 4, |
| } |
| |
| value0 = 1 |
| extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7] |
| name = 'BT#' |
| |
| def __init__(self, name=None, *, description): |
| super().__init__( |
| name, |
| decodeTable=self.decodeTable, |
| description=description) |
| |
| def mnemonic(self, index): |
| if index==0: return '0' |
| if index==1: return '0001' |
| return 'x'*(self.extraBits(index))+','+self.bitPattern(index) |
| |
| def explanation(self, index, extra): |
| value = self.value(index, extra) |
| description = self.description |
| if value==1: description = description[:-1] |
| return '{}: {} {}'.format( |
| self.mnemonic(index), |
| value, |
| description) |
| |
| class BlockTypeAlphabet(Code): |
| """The block types; this code works for all three kinds. |
| >>> b = BlockTypeAlphabet('T', NBLTYPES=5) |
| >>> print(*(x for x in b)) |
| prev +1 #0 #1 #2 #3 #4 |
| """ |
| def __init__(self, name, NBLTYPES, **args): |
| super().__init__(name, alphabetSize=NBLTYPES+2, **args) |
| self.NBLTYPES = NBLTYPES |
| |
| def mnemonic(self, index): |
| if index==0: return 'prev' |
| elif index==1: return '+1' |
| else: return '#'+str(index-2) |
| |
| def value(self, index): |
| return index-2 |
| |
| def explanation(self, index): |
| if index==0: return '0: previous' |
| elif index==1: return '1: increment' |
| else: return 'Set block type to: '+str(index-2) |
| |
| class BlockCountAlphabet(Enumerator): |
| """Block counts |
| >>> b = BlockCountAlphabet('L') |
| >>> print(b[25]) |
| [24*x]: BC16625-16793840 |
| """ |
| |
| value0 = 1 |
| extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24] |
| def __init__(self, name, **args): |
| super().__init__(name, alphabetSize=26, **args) |
| |
| def mnemonic(self, index): |
| extraBits = self.extraBits(index) |
| return '{}: BC{}-{}'.format( |
| 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits), |
| *self.span(index)) |
| |
| def explanation(self, index, extra): |
| return 'Block count: '+super().explanation(index, extra) |
| |
| class DistanceParamAlphabet(WithExtra): |
| """The distance parameters NPOSTFIX and NDIRECT. |
| Although these are treated as two in the description, this is easier. |
| """ |
| def __init__(self): |
| super().__init__('DIST', bitLength=2) |
| |
| def extraBits(self, index): |
| return 4 |
| |
| def value(self, index, extra): |
| """Returns NPOSTFIX and NDIRECT<<NPOSTFIX |
| """ |
| if extra>15: |
| raise ValueError('value: extra out of range') |
| return index, extra<<index |
| |
| def explanation(self, index, extra): |
| return '{} postfix bits and {:04b}<<{}={} direct codes'.format( |
| index, extra, index, extra<<index) |
| |
| def mnemonic(self, index): |
| return 'PF'+str(index) |
| |
| class LiteralContextMode(Code): |
| """For the literal context modes. |
| >>> LiteralContextMode().showCode() |
| 00:LSB6 01:MSB6 10:UTF8 11:Signed |
| >>> LiteralContextMode().explanation(2) |
| 'Context mode for type 9: 2(UTF8)' |
| """ |
| |
| def __init__(self, *, number=9): |
| super().__init__('LC'+str(number), bitLength=2) |
| self.number = number |
| |
| def mnemonic(self, index): |
| return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index] |
| |
| def explanation(self, index): |
| return 'Context mode for type {}: {}({})'.format( |
| self.number, |
| index, |
| self.mnemonic(index)) |
| |
| class RLEmaxAlphabet(Enumerator): |
| """Used for describing the run length encoding used for describing context maps. |
| >>> RLEmaxAlphabet().showCode() |
| 0:1 1:more |
| """ |
| value0 = 0 |
| extraTable = [0, 4] |
| name = 'RLE#' |
| |
| def mnemonic(self, index): |
| return ['1', 'more'][index] |
| |
| def explanation(self, index, extra): |
| description = self.description and self.description+': ' |
| if index==0: return description+'No RLE coding' |
| return '{}xxxx 1: RLEMAX={}'.format(description, extra+1) |
| |
| class TreeAlphabet(WithExtra): |
| """The alphabet to enumerate entries (called trees) in the context map. |
| parameters are RLEMAX and NTREES |
| >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5) |
| >>> len(t) |
| 8 |
| >>> print(t[2]) |
| xx+4 zeroes |
| >>> t[3].explanation(2) |
| '8+010=10 zeroes' |
| >>> t[0].value(0) |
| (1, 0) |
| """ |
| name = 'CMI' |
| def __init__(self, name=None, *, RLEMAX, NTREES, **args): |
| super().__init__(name, alphabetSize=RLEMAX+NTREES, **args) |
| self.RLEMAX = RLEMAX |
| self.NTREES = NTREES |
| |
| def extraBits(self, index): |
| if 0<index<=self.RLEMAX: return index |
| return 0 |
| |
| def mnemonic(self, index): |
| if index==0: return 'map #0' |
| if index<=self.RLEMAX: |
| return '{}+{} zeroes'.format('x'*index, 1<<index) |
| return 'map #{}'.format(index-self.RLEMAX) |
| |
| def value(self, index, extra): |
| """Give count and value.""" |
| index = index |
| if index==0: return 1, 0 |
| if index<=self.RLEMAX: return (1<<index)+extra, 0 |
| return 1, index-self.RLEMAX |
| |
| def explanation(self, index, extra): |
| description = self.description and self.description+': ' |
| if index==0: return description+'map #0' |
| if index<=self.RLEMAX: |
| return '{}+{:0{}b}={} zeroes'.format( |
| (1<<index), |
| extra, self.extraBits(index), |
| (1<<index)+extra) |
| return '{}map #{}-{}={}'.format( |
| description, |
| index, self.RLEMAX, index-self.RLEMAX) |
| |
| #Prefix alphabets for the data stream---------------------------------- |
| class LiteralAlphabet(Code): |
| """Alphabet of symbols. |
| """ |
| minLength = maxLength = 8 |
| def __init__(self, number): |
| super().__init__('L'+str(number), alphabetSize=1<<8) |
| |
| def mnemonic(self, index): |
| return outputCharFormatter(index) |
| |
| def value(self, index, extra=None): |
| return index |
| |
| def explanation(self, index, extra=None): |
| return self.mnemonic(index) |
| |
| class InsertLengthAlphabet(Enumerator): |
| """Intern code for insert counts |
| """ |
| value0 = 0 |
| extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24] |
| |
| class CopyLengthAlphabet(Enumerator): |
| value0 = 2 |
| extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24] |
| |
| class InsertAndCopyAlphabet(WithExtra): |
| """The insert and copy code |
| >>> for x in range(0,704,704//13): |
| ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x]) |
| 0 I0C2&D=0 |
| 110110 I6+xC8&D=0 |
| 1101100 I5C22+xxx&D=0 |
| 10100010 I4C4 |
| 11011000 I3C10+x |
| 100001110 I14+xxC8 |
| 101000100 I10+xxC22+xxx |
| 101111010 I98+xxxxxC14+xx |
| 110110000 I6+xC70+xxxxx |
| 111100110 I1090+[10*x]C8 |
| 1000011100 I26+xxxC326+[8*x] |
| 1001010010 I322+[8*x]C14+xx |
| 1010001000 I194+[7*x]C70+xxxxx |
| 1010111110 I22594+[24*x]C1094+[10*x] |
| """ |
| insertLengthAlphabet = InsertLengthAlphabet(None) |
| copyLengthAlphabet = CopyLengthAlphabet(None) |
| |
| def __init__(self, number=''): |
| super().__init__('IC'+str(number), bitLength=10) |
| |
| def __len__(self): |
| return 704 |
| |
| def extraBits(self, index): |
| insertSymbol, copySymbol, dist0 = self.splitSymbol(index) |
| return InsertLengthAlphabet.extraTable[insertSymbol.index] + \ |
| CopyLengthAlphabet.extraTable[copySymbol.index] |
| |
| def splitSymbol(self, index): |
| """Give relevant values for computations: |
| (insertSymbol, copySymbol, dist0flag) |
| """ |
| #determine insert and copy upper bits from table |
| row = [0,0,1,1,2,2,1,3,2,3,3][index>>6] |
| col = [0,1,0,1,0,1,2,0,2,1,2][index>>6] |
| #determine inserts and copy sub codes |
| insertLengthCode = row<<3 | index>>3&7 |
| if row: insertLengthCode -= 8 |
| copyLengthCode = col<<3 | index&7 |
| return ( |
| Symbol(self.insertLengthAlphabet, insertLengthCode), |
| Symbol(self.copyLengthAlphabet, copyLengthCode), |
| row==0 |
| ) |
| |
| def mnemonic(self, index): |
| """Make a nice mnemonic |
| """ |
| i,c,d0 = self.splitSymbol(index) |
| iLower, _ = i.code.span(i.index) |
| iExtra = i.extraBits() |
| cLower, _ = c.code.span(c.index) |
| cExtra = c.extraBits() |
| return 'I{}{}{}C{}{}{}{}'.format( |
| iLower, |
| '+' if iExtra else '', |
| 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra), |
| cLower, |
| '+' if cExtra else '', |
| 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra), |
| '&D=0' if d0 else '') |
| |
| def value(self, index, extra): |
| i,c,d0 = self.splitSymbol(index) |
| iExtra = i.extraBits() |
| ce, ie = extra>>iExtra, extra&(1<<iExtra)-1 |
| insert = i.value(ie) |
| copy = c.value(ce) |
| return insert, copy, d0 |
| |
| def explanation(self, index, extra): |
| insert, copy, d0 = self.value(index, extra) |
| if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy) |
| else: return 'Literal: {}, copy: {}'.format(insert, copy) |
| |
| class DistanceAlphabet(WithExtra): |
| """Represent the distance encoding. |
| Dynamically generated alphabet. |
| This is what the documentation should have said: |
| Ignoring offsets for the moment, the "long" encoding works as follows: |
| Write the distance in binary as follows: |
| 1xy..yz..z, then the distance symbol consists of n..nxz..z |
| Where: |
| n is one less than number of bits in y |
| x is a single bit |
| y..y are n+1 extra bits (encoded in the bit stream) |
| z..z is NPOSTFIX bits that are part of the symbol |
| The offsets are so as to start at the lowest useable value: |
| if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1 |
| then n..nxz..z is symbol -NDIRECT-16 |
| >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| >>> print(d[4], d[17], d[34]) |
| last-1 1 10xx00-5 |
| >>> [str(d[x]) for x in range(26, 32)] |
| ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5'] |
| """ |
| def __init__(self, number, *, NPOSTFIX, NDIRECT): |
| self.NPOSTFIX = NPOSTFIX |
| self.NDIRECT = NDIRECT |
| #set length |
| #Actually, not all symbols are used, |
| #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX) |
| super().__init__('D'+str(number), |
| alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX)) |
| |
| def extraBits(self, index): |
| """Indicate how many extra bits are needed to interpret symbol |
| >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| >>> [d[i].extraBits() for i in range(26)] |
| [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
| >>> [d[i].extraBits() for i in range(26,36)] |
| [1, 1, 1, 1, 1, 1, 1, 1, 2, 2] |
| """ |
| if index<16+self.NDIRECT: return 0 |
| return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) |
| |
| def value(self, dcode, dextra): |
| """Decode value of symbol together with the extra bits. |
| >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| >>> d[34].value(2) |
| (0, 35) |
| """ |
| if dcode<16: |
| return [(1,0),(2,0),(3,0),(4,0), |
| (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3), |
| (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3) |
| ][dcode] |
| if dcode<16+self.NDIRECT: |
| return (0,dcode-16) |
| #we use the original formulas, instead of my clear explanation |
| POSTFIX_MASK = (1 << self.NPOSTFIX) - 1 |
| ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) |
| hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX |
| lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK |
| offset = ((2 + (hcode & 1)) << ndistbits) - 4 |
| distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1 |
| return (0,distance) |
| |
| def mnemonic(self, index, verbose=False): |
| """Give mnemonic representation of meaning. |
| verbose compresses strings of x's |
| """ |
| if index<16: |
| return ['last', '2last', '3last', '4last', |
| 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3', |
| '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3' |
| ][index] |
| if index<16+self.NDIRECT: |
| return str(index-16) |
| #construct strings like "1xx01-15" |
| index -= self.NDIRECT+16 |
| hcode = index >> self.NPOSTFIX |
| lcode = index & (1<<self.NPOSTFIX)-1 |
| if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}' |
| else: formatString = '1{0}{1}{4:+d}' |
| return formatString.format( |
| hcode&1, |
| 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1), |
| lcode, self.NPOSTFIX, |
| self.NDIRECT+1-(4<<self.NPOSTFIX)) |
| |
| def explanation(self, index, extra): |
| """ |
| >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) |
| >>> d[55].explanation(13) |
| '11[1101]01-5: [0]+240' |
| """ |
| extraBits = self.extraBits(index) |
| extraString = '[{:0{}b}]'.format(extra, extraBits) |
| return '{0}: [{1[0]}]{1[1]:+d}'.format( |
| self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString), |
| self.value(index, extra)) |
| |
| #Classes for doing actual work------------------------------------------ |
| class ContextModeKeeper: |
| """For computing the literal context mode. |
| You feed it characters, and it computes indices in the context map. |
| """ |
| def __init__(self, mode): |
| self.chars = deque([0,0], maxlen=2) |
| self.mode = mode |
| |
| def setContextMode(self, mode): |
| """Switch to given context mode (0..3)""" |
| self.mode = mode |
| def getIndex(self): |
| if self.mode==0: #LSB6 |
| return self.chars[1]&0x3f |
| elif self.mode==1: #MSB6 |
| return self.chars[1]>>2 |
| elif self.mode==2: #UTF8: character class of previous and a bit of the second |
| p2,p1 = self.chars |
| return self.lut0[p1]|self.lut1[p2] |
| elif self.mode==3: #Signed: initial bits of last two bytes |
| p2,p1 = self.chars |
| return self.lut2[p1]<<3|self.lut2[p2] |
| |
| def add(self, index): |
| """Adjust the context for output char (as int).""" |
| self.chars.append(index) |
| |
| #0: control #16: quote #32: ,:; #48: AEIOU |
| #4: tab/lf/cr #20: % #36: . #52: BC..Z |
| #8: space #24: (<[{ #40: = #56: aeiou |
| #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z |
| lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, |
| 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, |
| 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, |
| 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, |
| 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, |
| 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0 |
| ]+[0,1]*32+[2,3]*32 |
| #0: space 1:punctuation 2:digit/upper 3:lower |
| lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, |
| 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, |
| 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0 |
| ]+[0]*96+[2]*32 |
| #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1 |
| lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7] |
| assert len(lut0)==len(lut1)==len(lut2)==256 |
| |
| class WordList: |
| """Word list. |
| >>> WordList().word(7, 35555) |
| b'Program to ' |
| """ |
| NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10, |
| 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, |
| 7, 6, 6, 5, 5] |
| def __init__(self): |
| self.file = open('dict', 'rb') |
| self.compileActions() |
| |
| def word(self, size, dist): |
| """Get word |
| """ |
| #split dist in index and action |
| ndbits = self.NDBITS[size] |
| index = dist&(1<<ndbits)-1 |
| action = dist>>ndbits |
| #compute position in file |
| position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index |
| self.file.seek(position) |
| return self.doAction(self.file.read(size), action) |
| |
| def upperCase1(self, word): |
| word = word.decode('utf8') |
| word = word[0].upper()+word[1:] |
| return word.encode('utf8') |
| |
| |
| #Super compact form of action table. |
| #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst |
| actionTable = r""" |
| 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_ |
| 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+. |
| 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w |
| 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+, |
| 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+=" |
| 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+=" |
| 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_ |
| 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_ |
| 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\' |
| 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+, |
| 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+=" |
| 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_ |
| 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+, |
| 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+( |
| 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._ |
| 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+. |
| 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\' |
| 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._ |
| 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+=" |
| 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\' |
| 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\' |
| 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+. |
| 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+"> |
| 23:w[:-3] 48:w[:-7] 98:_+w+=\' |
| 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+, |
| """ |
| |
| def compileActions(self): |
| """Build the action table from the text above |
| """ |
| import re |
| self.actionList = actions = [None]*121 |
| #Action 73, which is too long, looks like this when expanded: |
| actions[73] = "b' the '+w+b' of the '" |
| #find out what the columns are |
| actionLines = self.actionTable.splitlines() |
| colonPositions = [m.start() |
| for m in re.finditer(':',actionLines[1]) |
| ]+[100] |
| columns = [(colonPositions[i]-3,colonPositions[i+1]-3) |
| for i in range(len(colonPositions)-1)] |
| for line in self.actionTable.splitlines(keepends=False): |
| for start,end in columns: |
| action = line[start:end] |
| #skip empty actions |
| if not action or action.isspace(): continue |
| #chop it up, and check if the colon is properly placed |
| index, colon, action = action[:3], action[3], action[4:] |
| assert colon==':' |
| #remove filler spaces at right |
| action = action.rstrip() |
| #replace space symbols |
| action = action.replace('_', ' ') |
| wPos = action.index('w') |
| #add quotes around left string when present |
| #translation: any pattern from beginning, up to |
| #(but not including) a + following by a w later on |
| action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action) |
| #add quotes around right string when present |
| #translation: anything with a w in it, followed by a + |
| #and a pattern up to the end |
| #(there is no variable lookbehind assertion, |
| #so we have to copy the pattern) |
| action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action) |
| #expand shortcut for uppercaseAll |
| action = action.replace(".U", ".upper()") |
| #store action |
| actions[int(index)] = action |
| |
| def doAction(self, w, action): |
| """Perform the proper action |
| """ |
| #set environment for the UpperCaseFirst |
| U = self.upperCase1 |
| return eval(self.actionList[action], locals()) |
| |
| class Layout: |
| """Class to layout the output. |
| """ |
| #display width of hexdata+bitdata |
| width = 25 |
| #general |
| def __init__(self, stream): |
| self.stream = stream |
| self.bitPtr = self.width |
| |
| def makeHexData(self, pos): |
| """Produce hex dump of all data containing the bits |
| from pos to stream.pos |
| """ |
| firstAddress = pos+7>>3 |
| lastAddress = self.stream.pos+7>>3 |
| return ''.join(map('{:02x} '.format, |
| self.stream.data[firstAddress:lastAddress])) |
| |
| def formatBitData(self, pos, width1, width2=0): |
| """Show formatted bit data: |
| Bytes are separated by commas |
| whole bytes are displayed in hex |
| >>> Layout(olleke).formatBitData(6, 2, 16) |
| '|00h|2Eh,|00' |
| >>> Layout(olleke).formatBitData(4, 1, 0) |
| '1' |
| """ |
| result = [] |
| #make empty prefix code explicit |
| if width1==0: result = ['()', ','] |
| for width in width1, width2: |
| #skip empty width2 |
| if width==0: continue |
| #build result backwards in a list |
| while width>0: |
| availableBits = 8-(pos&7) |
| if width<availableBits: |
| #read partial byte, beginning nor ending at boundary |
| data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1 |
| result.append('{:0{}b}'.format(data, width)) |
| elif availableBits<8: |
| #read rest of byte, ending at boundary |
| data = self.stream.data[pos>>3] >> (pos&7) |
| result.append('|{:0{}b}'.format(data, availableBits)) |
| else: |
| #read whole byte (in hex), beginning and ending at boundary |
| data = self.stream.data[pos>>3] |
| result.append('|{:02X}h'.format(data)) |
| width -= availableBits |
| pos += availableBits |
| #if width overshot from the availableBits subtraction, fix it |
| pos += width |
| #add comma to separate fields |
| result.append(',') |
| #concatenate pieces, reversed, skipping the last space |
| return ''.join(result[-2::-1]) |
| |
| def readPrefixCode(self, alphabet): |
| """give alphabet the prefix code that is read from the stream |
| Called for the following alphabets, in this order: |
| The alphabet in question must have a "logical" order, |
| otherwise the assignment of symbols doesn't work. |
| """ |
| mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name)) |
| if mode=='Complex': |
| #for a complex code, numberOfSymbols means hskip |
| self.readComplexCode(numberOfSymbols, alphabet) |
| return alphabet |
| else: |
| table = [] |
| #Set table of lengths for mnemonic function |
| lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1] |
| #adjust mnemonic function of alphabet class |
| def myMnemonic(index): |
| return '{} bit{}: {}'.format( |
| lengths[i], |
| '' if lengths[i]==1 else 's', |
| alphabet.__class__.mnemonic(alphabet, index) |
| ) |
| alphabet.mnemonic = myMnemonic |
| for i in range(numberOfSymbols): |
| table.append(self.verboseRead(alphabet, skipExtra=True).index) |
| #restore mnemonic |
| del alphabet.mnemonic |
| if numberOfSymbols==4: |
| #read tree shape to redefine lengths |
| lengths = self.verboseRead(TreeShapeAlhabet()) |
| #construct the alphabet prefix code |
| alphabet.setLength(dict(zip(table, lengths))) |
| return alphabet |
| |
| def readComplexCode(self, hskip, alphabet): |
| """Read complex code""" |
| stream = self.stream |
| #read the lengths for the length code |
| lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:] |
| codeLengths = {} |
| total = 0 |
| lol = LengthOfLengthAlphabet('##'+alphabet.name) |
| #lengthCode will be used for coding the lengths of the new code |
| #we use it for display until now; definition comes below |
| lengthCode = LengthAlphabet('#'+alphabet.name) |
| lengthIter = iter(lengths) |
| lengthsLeft = len(lengths) |
| while total<32 and lengthsLeft>0: |
| lengthsLeft -= 1 |
| newSymbol = next(lengthIter) |
| lol.description = str(lengthCode[newSymbol]) |
| length = self.verboseRead(lol) |
| if length: |
| codeLengths[newSymbol] = length |
| total += 32>>length |
| if total>32: raise ValueError("Stream format") |
| if len(codeLengths)==1: codeLengths[list(codeLengths.keys())[0]] = 0 |
| #Now set the encoding of the lengthCode |
| lengthCode.setLength(codeLengths) |
| print("***** Lengths for {} will be coded as:".format(alphabet.name)) |
| lengthCode.showCode() |
| #Now determine the symbol lengths with the lengthCode |
| symbolLengths = {} |
| total = 0 |
| lastLength = 8 |
| alphabetIter = iter(alphabet) |
| while total<32768: |
| #look ahead to see what is going to happen |
| length = lengthCode.decodePeek( |
| self.stream.peek(lengthCode.maxLength))[1].index |
| #in every branch, set lengthCode.description to explanatory text |
| #lengthCode calls format(symbol, extra) with this string |
| if length==0: |
| symbol = next(alphabetIter) |
| lengthCode.description = 'symbol {} unused'.format(symbol) |
| self.verboseRead(lengthCode) |
| #unused symbol |
| continue |
| if length==16: |
| lengthCode.description = \ |
| '{1}+3 symbols of length '+str(lastLength) |
| extra = self.verboseRead(lengthCode) |
| #scan series of 16s (repeat counts) |
| #start with repeat count 2 |
| repeat = 2 |
| startSymbol = next(alphabetIter) |
| endSymbol = next(alphabetIter) |
| symbolLengths[startSymbol.index] = \ |
| symbolLengths[endSymbol.index] = lastLength |
| #count the two just defined symbols |
| total += 2*32768>>lastLength |
| #note: loop may end because we're there |
| #even if a 16 _appears_ to follow |
| while True: |
| #determine last symbol |
| oldRepeat = repeat |
| repeat = (repeat-2<<2)+extra+3 |
| #read as many symbols as repeat increased |
| for i in range(oldRepeat, repeat): |
| endSymbol = next(alphabetIter) |
| symbolLengths[endSymbol.index] = lastLength |
| #compute new total; it may be end of loop |
| total += (repeat-oldRepeat)*32768>>lastLength |
| if total>=32768: break |
| #see if there is more to do |
| length = lengthCode.decodePeek( |
| self.stream.peek(lengthCode.maxLength))[1].index |
| if length!=16: break |
| lengthCode.description = 'total {}+{{1}} symbols'.format( |
| (repeat-2<<2)+3) |
| extra = self.verboseRead(lengthCode) |
| elif length==17: |
| #read, and show explanation |
| lengthCode.description = '{1}+3 unused' |
| extra = self.verboseRead(lengthCode) |
| #scan series of 17s (groups of zero counts) |
| #start with repeat count 2 |
| repeat = 2 |
| startSymbol = next(alphabetIter) |
| endSymbol = next(alphabetIter) |
| #note: loop will not end with total==32768, |
| #since total doesn't change here |
| while True: |
| #determine last symbol |
| oldRepeat = repeat |
| repeat = (repeat-2<<3)+extra+3 |
| #read as many symbols as repeat increases |
| for i in range(repeat-oldRepeat): |
| endSymbol = next(alphabetIter) |
| #see if there is more to do |
| length = lengthCode.decodePeek( |
| self.stream.peek(lengthCode.maxLength))[1].index |
| if length!=17: break |
| lengthCode.description = 'total {}+{{1}} unused'.format( |
| (repeat-2<<3)+3) |
| extra = self.verboseRead(lengthCode) |
| else: |
| symbol = next(alphabetIter) |
| #double braces for format |
| char = str(symbol) |
| if char in '{}': char *= 2 |
| lengthCode.description = \ |
| 'Length for {} is {{0.index}} bits'.format(char) |
| #output is not needed (will be 0) |
| self.verboseRead(lengthCode) |
| symbolLengths[symbol.index] = length |
| total += 32768>>length |
| lastLength = length |
| assert total==32768 |
| alphabet.setLength(symbolLengths) |
| print('End of table. Prefix code '+alphabet.name+':') |
| alphabet.showCode() |
| |
| #stream |
| def processStream(self): |
| """Process a brotli stream. |
| """ |
| print('addr hex{:{}s}binary context explanation'.format( |
| '', self.width-10)) |
| print('Stream header'.center(60, '-')) |
| self.windowSize = self.verboseRead(WindowSizeAlphabet()) |
| print('Metablock header'.center(60, '=')) |
| self.ISLAST = False |
| self.output = bytearray() |
| while not self.ISLAST: |
| self.ISLAST = self.verboseRead( |
| BoolCode('LAST', description="Last block")) |
| if self.ISLAST: |
| if self.verboseRead( |
| BoolCode('EMPTY', description="Empty block")): break |
| if self.metablockLength(): continue |
| if not self.ISLAST and self.uncompressed(): continue |
| print('Block type descriptors'.center(60, '-')) |
| self.numberOfBlockTypes = {} |
| self.currentBlockCounts = {} |
| self.blockTypeCodes = {} |
| self.blockCountCodes = {} |
| for blockType in (L,I,D): self.blockType(blockType) |
| print('Distance code parameters'.center(60, '-')) |
| self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet()) |
| self.readLiteralContextModes() |
| print('Context maps'.center(60, '-')) |
| self.cmaps = {} |
| #keep the number of each kind of prefix tree for the last loop |
| numberOfTrees = {I: self.numberOfBlockTypes[I]} |
| for blockType in (L,D): |
| numberOfTrees[blockType] = self.contextMap(blockType) |
| print('Prefix code lists'.center(60, '-')) |
| self.prefixCodes = {} |
| for blockType in (L,I,D): |
| self.readPrefixArray(blockType, numberOfTrees[blockType]) |
| self.metablock() |
| |
| #metablock header |
| def verboseRead(self, alphabet, context='', skipExtra=False): |
| """Read symbol and extra from stream and explain what happens. |
| Returns the value of the symbol |
| >>> olleke.pos = 0 |
| >>> l = Layout(olleke) |
| >>> l.verboseRead(WindowSizeAlphabet()) |
| 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 4194288 |
| """ |
| #TODO 2: verbosity level, e.g. show only codes and maps in header |
| stream = self.stream |
| pos = stream.pos |
| if skipExtra: |
| length, symbol = alphabet.readTuple(stream) |
| extraBits, extra = 0, None |
| else: |
| length, symbol, extraBits, extra = alphabet.readTupleAndExtra( |
| stream) |
| #fields: address, hex data, binary data, name of alphabet, explanation |
| hexdata = self.makeHexData(pos) |
| addressField = '{:04x}'.format(pos+7>>3) if hexdata else '' |
| bitdata = self.formatBitData(pos, length, extraBits) |
| #bitPtr moves bitdata so that the bytes are easier to read |
| #jump back to right if a new byte starts |
| if '|' in bitdata[1:]: |
| #start over on the right side |
| self.bitPtr = self.width |
| fillWidth = self.bitPtr-(len(hexdata)+len(bitdata)) |
| if fillWidth<0: fillWidth = 0 |
| print('{:<5s} {:<{}s} {:7s} {}'.format( |
| addressField, |
| hexdata+' '*fillWidth+bitdata, self.width, |
| context+alphabet.name, |
| symbol if skipExtra else symbol.explanation(extra), |
| )) |
| #jump to the right if we started with a '|' |
| #because we didn't jump before printing |
| if bitdata.startswith('|'): self.bitPtr = self.width |
| else: self.bitPtr -= len(bitdata) |
| return symbol if skipExtra else symbol.value(extra) |
| |
| def metablockLength(self): |
| """Read MNIBBLES and meta block length; |
| if empty block, skip block and return true. |
| """ |
| self.MLEN = self.verboseRead(MetablockLengthAlphabet()) |
| if self.MLEN: |
| return False |
| #empty block; skip and return False |
| self.verboseRead(ReservedAlphabet()) |
| MSKIP = self.verboseRead(SkipLengthAlphabet()) |
| self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) |
| self.stream.pos += 8*MSKIP |
| print("Skipping to {:x}".format(self.stream.pos>>3)) |
| return True |
| |
| def uncompressed(self): |
| """If true, handle uncompressed data |
| """ |
| ISUNCOMPRESSED = self.verboseRead( |
| BoolCode('UNCMPR', description='Is uncompressed?')) |
| if ISUNCOMPRESSED: |
| self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) |
| print('Uncompressed data:') |
| self.output += self.stream.readBytes(self.MLEN) |
| print(outputFormatter(self.output[-self.MLEN:])) |
| return ISUNCOMPRESSED |
| |
| def blockType(self, kind): |
| """Read block type switch descriptor for given kind of blockType.""" |
| NBLTYPES = self.verboseRead(TypeCountAlphabet( |
| 'BT#'+kind[0].upper(), |
| description='{} block types'.format(kind), |
| )) |
| self.numberOfBlockTypes[kind] = NBLTYPES |
| if NBLTYPES>=2: |
| self.blockTypeCodes[kind] = self.readPrefixCode( |
| BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES)) |
| self.blockCountCodes[kind] = self.readPrefixCode( |
| BlockCountAlphabet('BC'+kind[0].upper())) |
| blockCount = self.verboseRead(self.blockCountCodes[kind]) |
| else: |
| blockCount = 1<<24 |
| self.currentBlockCounts[kind] = blockCount |
| |
| def readLiteralContextModes(self): |
| """Read literal context modes. |
| LSB6: lower 6 bits of last char |
| MSB6: upper 6 bits of last char |
| UTF8: roughly dependent on categories: |
| upper 4 bits depend on category of last char: |
| control/whitespace/space/ punctuation/quote/%/open/close/ |
| comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant |
| lower 2 bits depend on category of 2nd last char: |
| space/punctuation/digit or upper/lowercase |
| signed: hamming weight of last 2 chars |
| """ |
| print('Context modes'.center(60, '-')) |
| self.literalContextModes = [] |
| for i in range(self.numberOfBlockTypes[L]): |
| self.literalContextModes.append( |
| self.verboseRead(LiteralContextMode(number=i))) |
| |
| def contextMap(self, kind): |
| """Read context maps |
| Returns the number of differnt values on the context map |
| (In other words, the number of prefix trees) |
| """ |
| NTREES = self.verboseRead(TypeCountAlphabet( |
| kind[0].upper()+'T#', |
| description='{} prefix trees'.format(kind))) |
| mapSize = {L:64, D:4}[kind] |
| if NTREES<2: |
| self.cmaps[kind] = [0]*mapSize |
| else: |
| #read CMAPkind |
| RLEMAX = self.verboseRead(RLEmaxAlphabet( |
| 'RLE#'+kind[0].upper(), |
| description=kind+' context map')) |
| alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX) |
| cmapCode = self.readPrefixCode(alphabet) |
| tableSize = mapSize*self.numberOfBlockTypes[kind] |
| cmap = [] |
| while len(cmap)<tableSize: |
| cmapCode.description = 'map {}, entry {}'.format( |
| *divmod(len(cmap), mapSize)) |
| count, value = self.verboseRead(cmapCode) |
| cmap.extend([value]*count) |
| assert len(cmap)==tableSize |
| IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF')) |
| if IMTF: |
| self.IMTF(cmap) |
| if kind==L: |
| print('Context maps for literal data:') |
| for i in range(0, len(cmap), 64): |
| print(*( |
| ''.join(map(str, cmap[j:j+8])) |
| for j in range(i, i+64, 8) |
| )) |
| else: |
| print('Context map for distances:') |
| print(*( |
| ''.join(map('{:x}'.format, cmap[i:i+4])) |
| for i in range(0, len(cmap), 4) |
| )) |
| self.cmaps[kind] = cmap |
| return NTREES |
| |
| @staticmethod |
| def IMTF(v): |
| """In place inverse move to front transform. |
| """ |
| #mtf is initialized virtually with range(infinity) |
| mtf = [] |
| for i, vi in enumerate(v): |
| #get old value from mtf. If never seen, take virtual value |
| try: value = mtf.pop(vi) |
| except IndexError: value = vi |
| #put value at front |
| mtf.insert(0, value) |
| #replace transformed value |
| v[i] = value |
| |
| def readPrefixArray(self, kind, numberOfTrees): |
| """Read prefix code array""" |
| prefixes = [] |
| for i in range(numberOfTrees): |
| if kind==L: alphabet = LiteralAlphabet(i) |
| elif kind==I: alphabet = InsertAndCopyAlphabet(i) |
| elif kind==D: alphabet = DistanceAlphabet( |
| i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT) |
| self.readPrefixCode(alphabet) |
| prefixes.append(alphabet) |
| self.prefixCodes[kind] = prefixes |
| |
| #metablock data |
| def metablock(self): |
| """Process the data. |
| Relevant variables of self: |
| numberOfBlockTypes[kind]: number of block types |
| currentBlockTypes[kind]: current block types (=0) |
| literalContextModes: the context modes for the literal block types |
| currentBlockCounts[kind]: counters for block types |
| blockTypeCodes[kind]: code for block type |
| blockCountCodes[kind]: code for block count |
| cmaps[kind]: the context maps (not for I) |
| prefixCodes[kind][#]: the prefix codes |
| lastDistances: the last four distances |
| lastChars: the last two chars |
| output: the result |
| """ |
| print('Meta block contents'.center(60, '=')) |
| self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1} |
| self.lastDistances = deque([17,16,11,4], maxlen=4) |
| #the current context mode is for block type 0 |
| self.contextMode = ContextModeKeeper(self.literalContextModes[0]) |
| wordList = WordList() |
| |
| #setup distance callback function |
| def distanceCallback(symbol, extra): |
| "callback function for displaying decoded distance" |
| index, offset = symbol.value(extra) |
| if index: |
| #recent distance |
| distance = self.lastDistances[-index]+offset |
| return 'Distance: {}last{:+d}={}'.format(index, offset, distance) |
| #absolute value |
| if offset<=maxDistance: |
| return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset) |
| #word list value |
| action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen]) |
| return '{}-{} gives word {},{} action {}'.format( |
| offset, maxDistance, copyLen, word, action) |
| for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback |
| |
| blockLen = 0 |
| #there we go |
| while blockLen<self.MLEN: |
| #get insert© command |
| litLen, copyLen, dist0Flag = self.verboseRead( |
| self.prefixCodes[I][ |
| self.figureBlockType(I)]) |
| #literal data |
| for i in range(litLen): |
| bt = self.figureBlockType(L) |
| cm = self.contextMode.getIndex() |
| ct = self.cmaps[L][bt<<6|cm] |
| char = self.verboseRead( |
| self.prefixCodes[L][ct], |
| context='{},{}='.format(bt,cm)) |
| self.contextMode.add(char) |
| self.output.append(char) |
| blockLen += litLen |
| #check if we're done |
| if blockLen>=self.MLEN: return |
| #distance |
| #distances are computed relative to output length, at most window size |
| maxDistance = min(len(self.output), self.windowSize) |
| if dist0Flag: |
| distance = self.lastDistances[-1] |
| else: |
| bt = self.figureBlockType(D) |
| cm = {2:0, 3:1, 4:2}.get(copyLen, 3) |
| ct = self.cmaps[D][bt<<2|cm] |
| index, offset = self.verboseRead( |
| self.prefixCodes[D][ct], |
| context='{},{}='.format(bt,cm)) |
| distance = self.lastDistances[-index]+offset if index else offset |
| if index==1 and offset==0: |
| #to make sure distance is not put in last distance list |
| dist0Flag = True |
| if distance<=maxDistance: |
| #copy from output |
| for i in range( |
| maxDistance-distance, |
| maxDistance-distance+copyLen): |
| self.output.append(self.output[i]) |
| if not dist0Flag: self.lastDistances.append(distance) |
| comment = 'Seen before' |
| else: |
| #fetch from wordlist |
| newWord = wordList.word(copyLen, distance-maxDistance-1) |
| self.output.extend(newWord) |
| #adjust copyLen to reflect actual new data |
| copyLen = len(newWord) |
| comment = 'From wordlist' |
| blockLen += copyLen |
| print(' '*40, |
| comment, |
| ': "', |
| outputFormatter(self.output[-copyLen:]), |
| '"', |
| sep='') |
| self.contextMode.add(self.output[-2]) |
| self.contextMode.add(self.output[-1]) |
| |
| def figureBlockType(self, kind): |
| counts, types = self.currentBlockCounts, self.currentBlockTypes |
| if counts[kind]==0: |
| newType = self.verboseRead(self.blockTypeCodes[kind]) |
| if newType==-2: newType = types['P'+kind] |
| elif newType==-1: |
| newType = (types[kind]+1)%self.numberOfBlockTypes[kind] |
| types['P'+kind] = types[kind] |
| types[kind] = newType |
| counts[kind] = self.verboseRead(self.blockCountCodes[kind]) |
| counts[kind] -=1 |
| return types[kind] |
| |
| __test__ = { |
| 'BitStream': """ |
| >>> bs = BitStream(b'Jurjen') |
| >>> bs.readBytes(2) |
| b'Ju' |
| >>> bs.read(6) #r=01110010 |
| 50 |
| >>> bs |
| BitStream(pos=2:6) |
| >>> bs.peek(5) #j=01101010 |
| 9 |
| >>> bs.readBytes(2) |
| Traceback (most recent call last): |
| ... |
| ValueError: readBytes: need byte boundary |
| """, |
| |
| 'Symbol': """ |
| >>> a=Symbol(MetablockLengthAlphabet(),5) |
| >>> len(a) |
| 2 |
| >>> int(a) |
| 5 |
| >>> a.bitPattern() |
| '01' |
| >>> a.value(200000) |
| 200001 |
| >>> a.explanation(300000) |
| 'data length: 493e0h+1=300001' |
| """, |
| |
| 'RangeDecoder': """ |
| >>> a=RangeDecoder(bitLength=3) |
| >>> len(a) |
| 8 |
| >>> a.name='t' |
| >>> list(a) |
| [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)] |
| >>> a[2] |
| Symbol(t, 2) |
| >>> a.bitPattern(4) |
| '100' |
| >>> a.length(2) |
| 3 |
| >>> a.decodePeek(15) |
| (3, Symbol(t, 7)) |
| >>> |
| |
| """, |
| |
| 'PrefixDecoder': """ |
| >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4}) |
| >>> len(a) |
| 4 |
| >>> a.name='t' |
| >>> list(a) |
| [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)] |
| >>> a.decodePeek(22) |
| (1, Symbol(t, 1)) |
| >>> a.decodePeek(27) |
| (3, Symbol(t, 3)) |
| >>> a.length(1) |
| 1 |
| >>> a.length(4) |
| 3 |
| """, |
| |
| 'Code': """ |
| >>> a=Code('t',alphabetSize=10) |
| >>> len(a) |
| 10 |
| >>> a.showCode() |
| 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9 |
| >>> a.setLength({2:1,3:2,5:3,6:3}) |
| >>> a.showCode() |
| 0:2 01:3 011:5 111:6 |
| >>> len(a) |
| 4 |
| >>> def callback(i): return 'call{}back'.format(i) |
| >>> a=Code('t',callback=callback,bitLength=3) |
| >>> a[6].explanation() |
| 'call6back' |
| """, |
| |
| 'WithExtra': """ |
| >>> class A(WithExtra): |
| ... extraTable = [0,1,1,2,2] |
| >>> a=A('t',alphabetSize=5) |
| >>> a[1] |
| Symbol(t, 1) |
| >>> a.extraBits(2) |
| 1 |
| >>> a.mnemonic(4) |
| '4' |
| >>> a.readTupleAndExtra(BitStream(b'\x5b')) |
| (3, Symbol(t, 3), 2, 3) |
| """, |
| |
| 'BoolCode': """ |
| >>> BoolCode('test')[0].explanation() |
| '0: False' |
| """, |
| |
| 'Enumerator': """ |
| >>> class A(Enumerator): |
| ... extraTable = [0,1,1,2,2] |
| ... value0=3 |
| >>> a=A(alphabetLength=5) |
| >>> a.value(3) |
| Traceback (most recent call last): |
| ... |
| TypeError: value() missing 1 required positional argument: 'extra' |
| >>> a.explanation(3,4) |
| 'xx 011: 8-11; 8+4=12' |
| """, |
| |
| 'WindowSizeAlphabet': """ |
| >>> windowSizeAlphabet = WindowSizeAlphabet() |
| >>> windowSizeAlphabet[0] |
| Traceback (most recent call last): |
| ... |
| ValueError: No symbol WindowSizeAlphabet[0] |
| >>> len(windowSizeAlphabet) |
| 16 |
| >>> windowSizeAlphabet[21] |
| Symbol(WSIZE, 21) |
| >>> windowSizeAlphabet[21].bitPattern() |
| '1001' |
| >>> windowSizeAlphabet[21].extraBits() |
| 0 |
| >>> windowSizeAlphabet[21].index |
| 21 |
| >>> windowSizeAlphabet[10].value() |
| 1008 |
| >>> windowSizeAlphabet[10].explanation() |
| 'windowsize=(1<<10)-16=1008' |
| >>> windowSizeAlphabet.showCode() |
| 0:65520 1100001:16368 1110001:32752 0011:262128 |
| 0000001:131056 0010001:None 1001:2097136 1011:4194288 |
| 1000001:4080 1010001:8176 0101:524272 0111:1048560 |
| 0100001:1008 0110001:2032 1101:8388592 1111:16777200 |
| """, |
| |
| 'TypeCountAlphabet': """ |
| >>> typeCountAlphabet = TypeCountAlphabet(description='bananas') |
| >>> len(typeCountAlphabet) |
| 9 |
| >>> typeCountAlphabet[3] |
| Symbol(BT#, 3) |
| >>> typeCountAlphabet[9] |
| Traceback (most recent call last): |
| ... |
| ValueError: No symbol TypeCountAlphabet[9] |
| >>> print(typeCountAlphabet[3]) |
| xx,0101 |
| >>> typeCountAlphabet[8].value(127) |
| 256 |
| >>> typeCountAlphabet[4].explanation(2) |
| 'xxx,0111: 11 bananas' |
| >>> typeCountAlphabet[0].explanation() |
| '0: 1 banana' |
| """, |
| |
| 'DistanceParamAlphabet': """ |
| >>> dpa = DistanceParamAlphabet() |
| >>> dpa.showCode() |
| 00:PF0 01:PF1 10:PF2 11:PF3 |
| >>> dpa.readTupleAndExtra(BitStream(b'\\x29')) |
| (2, Symbol(DIST, 1), 4, 10) |
| >>> dpa.explanation(2, 5) |
| '2 postfix bits and 0101<<2=20 direct codes' |
| """, |
| |
| 'LiteralAlphabet': """ |
| >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS |
| 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0 |
| 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1 |
| 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2 |
| ... |
| 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff |
| 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc |
| 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd |
| 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce |
| 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf |
| """, |
| |
| 'BlockCountAlphabet': """ |
| >>> bc=BlockCountAlphabet('BCL') |
| >>> len(bc) |
| 26 |
| >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02') |
| >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 'Block count: xx 00000: 1-4; 1+2=3' |
| >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 'Block count: xxx 00110: 33-40; 33+0=33' |
| >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 'Block count: xxxxxx 10001: 305-368; 305+28=333' |
| >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) |
| 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333' |
| """, |
| |
| 'Layout': """ |
| >>> olleke.pos = 0 |
| >>> l = Layout(olleke) |
| >>> l.verboseRead(WindowSizeAlphabet()) |
| 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| 4194288 |
| >>> l.verboseRead(BoolCode('LAST', description="Last block")) |
| 1 LAST Last block: 1: True |
| True |
| >>> l.verboseRead(BoolCode('EMPTY', description="Empty block")) |
| 0 EMPTY Empty block: 0: False |
| False |
| >>> l.verboseRead(MetablockLengthAlphabet()) |
| 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 |
| 47 |
| >>> olleke.pos = 76 |
| >>> l = Layout(olleke) |
| >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True) |
| 000a 82 10|1100 D0 10[15*x]-3 |
| >>> x.explanation(0x86a3) |
| '10[1000011010100011]-3: [0]+100000' |
| """, |
| |
| 'olleke': """ |
| >>> olleke.pos = 0 |
| >>> try: Layout(olleke).processStream() |
| ... except NotImplementedError: pass |
| ... #doctest: +REPORT_NDIFF |
| addr hex binary context explanation |
| -----------------------Stream header------------------------ |
| 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| ======================Metablock header====================== |
| 1 LAST Last block: 1: True |
| 0 EMPTY Empty block: 0: False |
| 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 |
| -------------------Block type descriptors------------------- |
| 0003 00 0 BT#L 0: 1 literal block type |
| 0 BT#I 0: 1 insert© block type |
| 0 BT#D 0: 1 distance block type |
| ------------------Distance code parameters------------------ |
| 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes |
| -----------------------Context modes------------------------ |
| 10 LC0 Context mode for type 0: 2(UTF8) |
| ------------------------Context maps------------------------ |
| 0 LT# 0: 1 literal prefix tree |
| 0 DT# 0: 1 distance prefix tree |
| ---------------------Prefix code lists---------------------- |
| 10 PFX L0 is complex with lengths 3,4,0,5,17... |
| 0005 4f 1|0 ##L0 len 3: coded with 3 bits |
| 0111 ##L0 len 4: coded with 1 bits |
| 10 ##L0 unused: coded with 3 bits |
| 0006 d6 0|0 ##L0 len 5: skipped |
| 011 ##L0 zero xxx: coded with 2 bits |
| ***** Lengths for L0 will be coded as: |
| 0:len 4 01:zero xxx 011:unused 111:len 3 |
| 0007 95 1|11,01 #L0 7+3 unused |
| 0 #L0 Length for \\n is 4 bits |
| 001,01 #L0 1+3 unused |
| 0008 44 010,0|1 #L0 total 19+2 unused |
| 0 #L0 Length for " " is 4 bits |
| 0 #L0 Length for ! is 4 bits |
| 0009 cb 011,|01 #L0 3+3 unused |
| |110,01 #L0 total 35+6 unused |
| 000a 82 0 #L0 Length for K is 4 bits |
| 000,01 #L0 0+3 unused |
| 0 #L0 Length for O is 4 bits |
| 000b 4d 01|1 #L0 symbol P unused |
| 011 #L0 symbol Q unused |
| 0 #L0 Length for R is 4 bits |
| 000c 88 000,|01 #L0 0+3 unused |
| |100,01 #L0 total 11+4 unused |
| 000d b6 0 #L0 Length for b is 4 bits |
| 011 #L0 symbol c unused |
| 011 #L0 symbol d unused |
| 000e 27 11|1 #L0 Length for e is 3 bits |
| 010,01 #L0 2+3 unused |
| |0 #L0 Length for k is 4 bits |
| 000f 1f 111 #L0 Length for l is 3 bits |
| 011 #L0 symbol m unused |
| 0 #L0 Length for n is 4 bits |
| |0 #L0 Length for o is 4 bits |
| 0010 c1 000,01 #L0 0+3 unused |
| 0 #L0 Length for s is 4 bits |
| 0011 b4 0|11 #L0 symbol t unused |
| 0 #L0 Length for u is 4 bits |
| End of table. Prefix code L0: |
| 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s |
| 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u |
| 11,01 PFX IC0 is simple with 4 code words |
| 0012 2a |2Ah|10 IC0 ? bits: I5C4 |
| 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7 |
| 0015 22 0010|111011 IC0 ? bits: I8+xC5 |
| 0016 8c 001100|0010 IC0 ? bits: I0C14+xx |
| 0 SHAPE False: lengths 2,2,2,2 |
| 0017 74 10,0|1 PFX D0 is simple with 3 code words |
| 0018 a6 0|01110 D0 1 bit: 2last-3 |
| 010011 D0 2 bits: 11xx-3 |
| 0019 aa 01010|1 D0 2 bits: 11xxx-3 |
| ====================Meta block contents===================== |
| |1,01 IC0 Literal: 9, copy: 5 |
| 001a 41 0001 0,0=L0 O |
| 100 0,48=L0 l |
| 001b a2 10|0 0,62=L0 l |
| 000 0,63=L0 e |
| 001c a1 1|101 0,59=L0 k |
| 000 0,63=L0 e |
| |1010 0,59=L0 " " |
| 001d b5 0101 0,11=L0 b |
| |1011 0,60=L0 o |
| 001e 24 0 0,3=D0 Distance: 2last-3=8 |
| Seen before: "lleke" |
| 0,10 IC0 Literal: 6, copy: 7 |
| |0010 0,59=L0 \\n |
| 001f 89 1001 0,7=L0 R |
| 000 0,52=L0 e |
| 0020 fa 010|1 0,58=L0 b |
| 1111 0,63=L0 u |
| 0021 eb 011|1 0,59=L0 s |
| 11,01 0,3=D0 Absolute value: 12 (pos 8) |
| Seen before: "olleke\\n" |
| 0022 db 01,1|1 IC0 Literal: 0, copy: 15 |
| |110,11 0,3=D0 Absolute value: 27 (pos 0) |
| Seen before: "Olleke bolleke\\n" |
| 0023 f8 00 IC0 Literal: 5, copy: 4 |
| 1110 0,7=L0 K |
| 0024 2c 00|11 0,52=L0 n |
| 1011 0,62=L0 o |
| 0025 0d 1|00 0,59=L0 l |
| 0110 0,63=L0 ! |
| """, |
| |
| 'file': """ |
| >>> try: Layout(BitStream( |
| ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb') |
| ... .read())).processStream() |
| ... except NotImplementedError: pass |
| addr hex binary context explanation |
| -----------------------Stream header------------------------ |
| 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| ======================Metablock header====================== |
| 1 LAST Last block: 1: True |
| 0 EMPTY Empty block: 0: False |
| 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 |
| -------------------Block type descriptors------------------- |
| 0003 00 0 BT#L 0: 1 literal block type |
| 0 BT#I 0: 1 insert© block type |
| 0 BT#D 0: 1 distance block type |
| ------------------Distance code parameters------------------ |
| 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes |
| -----------------------Context modes------------------------ |
| 10 LC0 Context mode for type 0: 2(UTF8) |
| ------------------------Context maps------------------------ |
| 0 LT# 0: 1 literal prefix tree |
| 0 DT# 0: 1 distance prefix tree |
| ---------------------Prefix code lists---------------------- |
| 0005 b0 0|1,01 PFX L0 is simple with 2 code words |
| 0006 b2 0|1011000 L0 1 bit: X |
| 0007 ea 0|1011001 L0 1 bit: Y |
| 01,01 PFX IC0 is simple with 2 code words |
| 0008 81 0000001|111 IC0 1 bit: I1C9&D=0 |
| 0009 47 02 0|47h|1 IC0 1 bit: I1C9 |
| 00,01 PFX D0 is simple with 1 code word |
| 000b 8a 010|000 D0 0 bits: 10x-3 |
| ====================Meta block contents===================== |
| 1 IC0 Literal: 1, copy: 9 |
| 0 0,0=L0 X |
| 0,() 0,3=D0 Absolute value: 1 (pos 0) |
| Seen before: "XXXXXXXXX" |
| 0 IC0 Literal: 1, copy: 9, same distance |
| |1 0,54=L0 Y |
| Seen before: "YYYYYYYYY" |
| """, |
| |
| 'XY': """ |
| >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream() |
| ... except NotImplementedError: pass |
| addr hex binary context explanation |
| -----------------------Stream header------------------------ |
| 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 |
| ======================Metablock header====================== |
| 1 LAST Last block: 1: True |
| 0 EMPTY Empty block: 0: False |
| 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 |
| -------------------Block type descriptors------------------- |
| 0003 00 0 BT#L 0: 1 literal block type |
| 0 BT#I 0: 1 insert© block type |
| 0 BT#D 0: 1 distance block type |
| ------------------Distance code parameters------------------ |
| 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes |
| -----------------------Context modes------------------------ |
| 10 LC0 Context mode for type 0: 2(UTF8) |
| ------------------------Context maps------------------------ |
| 0 LT# 0: 1 literal prefix tree |
| 0 DT# 0: 1 distance prefix tree |
| ---------------------Prefix code lists---------------------- |
| 0005 b0 0|1,01 PFX L0 is simple with 2 code words |
| 0006 b2 0|1011000 L0 1 bit: X |
| 0007 82 0|1011001 L0 1 bit: Y |
| 00,01 PFX IC0 is simple with 1 code word |
| 0008 84 0000100|100 IC0 0 bits: I4C6&D=0 |
| 0009 00 00,0|1 PFX D0 is simple with 1 code word |
| 000a e0 0|00000 D0 0 bits: last |
| ====================Meta block contents===================== |
| () IC0 Literal: 4, copy: 6, same distance |
| 0 0,0=L0 X |
| 0 0,52=L0 X |
| 0 0,54=L0 X |
| 0 0,54=L0 X |
| Seen before: "XXXXXX" |
| () IC0 Literal: 4, copy: 6, same distance |
| 1 0,54=L0 Y |
| 1 0,54=L0 Y |
| |1 0,54=L0 Y |
| 000b 01 1 0,54=L0 Y |
| Seen before: "YYYYYY" |
| """, |
| |
| 'empty': """ |
| >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream() |
| ... except NotImplementedError: pass |
| addr hex binary context explanation |
| -----------------------Stream header------------------------ |
| 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056 |
| ======================Metablock header====================== |
| |1 LAST Last block: 1: True |
| 0001 16 0 EMPTY Empty block: 0: False |
| 11 MLEN 11: empty block |
| 0 RSVD Reserved (must be zero) |
| 0002 00 000000|00,01 SKIP skip length: 0h+1=1 |
| |00 SKIP 2 bits ignored |
| Skipping to 4 |
| """, |
| |
| } |
| |
| if __name__=='__main__': |
| import sys |
| if len(sys.argv)>1: |
| l = Layout(BitStream(open(sys.argv[1],'rb').read())) |
| l.processStream() |
| else: |
| sys.path.append("h:/Persoonlijk/bin") |
| try: |
| import brotli |
| open('brotlidump.br', 'wb').write( |
| brotli.compress( |
| open('brotlidump.py', 'r').read() |
| )) |
| olleke = BitStream(brotli.compress( |
| 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!')) |
| except ImportError: pass |
| import doctest |
| doctest.testmod(optionflags=doctest.REPORT_NDIFF |
| #|doctest.FAIL_FAST |
| ) |