blob: 0d588eec15ecd64bdb8ad272e140b50861e3e3d7 [file] [log] [blame]
// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build ignore
package main
// print-lzw-example.go prints a worked example discussed in std/lzw/README.md,
// based on three implementations of a simplified core of the LZW algorithm.
// Simplifications include assuming LSB-first, 8 bit literals, no clear codes,
// no full tables and that the input is given in terms of a code stream, not a
// bit stream.
//
// The implementations are designed for ease of studying, and for e.g. a
// minimal diff between the suf1 and sufQ implementations, and are not
// optimized for performance.
//
// Usage: go run print-lzw-example.go
//
// You can also play with the code (e.g. modify Q and re-run) online:
// https://play.golang.org/p/1aLC_XHZzoA
import (
"fmt"
"os"
)
const (
expectedOutput = "TOBEORNOTTOBEORTOBEORNOTXOTXOTXOOTXOOOTXOOOTOBEY"
clearCode = 0x100
endCode = 0x101
// Neither clearCode or endCode can ever be a valid prefix code. We
// arbitrarily pick clearCode to represent "no prefix".
noPrefix = clearCode
// Q is the maximum possible suffix length, in bytes, which equals 1 plus
// the maximum possible "suffix length minus 1". Q == 1 means that sufQ is
// equivalent to suf1.
//
// In std/lzw/README.md, Q is 3 to keep the example short and simple. In
// practice, especially on modern CPUs that can do unaligned 32 or 64 bit
// loads and stores, Q is 4 or 8.
Q = 3
)
var codes = []int{
'T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T',
0x102, 0x104, 0x106, 0x10B, 0x105, 0x107, 0x109,
'X',
0x111, 0x113, 0x114, 0x115, 0x10E,
'Y',
0x101,
}
func codeString(code int) string {
if code < clearCode {
return fmt.Sprintf("%5c", code)
} else if code == noPrefix {
return " -"
}
return fmt.Sprintf("0x%03X", code)
}
func main() {
if err := main1(); err != nil {
os.Stderr.WriteString(err.Error() + "\n")
os.Exit(1)
}
}
func main1() error {
n, s, q := new(naive), new(suf1), new(sufQ)
decode(n)
decode(s)
decode(q)
key := endCode
fmt.Printf(" Code Emits Key Value Pre1+Suf1 LM1 /Q %%Q PreQ+SufQ\n")
for _, code := range codes {
// Code
fmt.Printf("%s", codeString(code))
// Emits
if code == endCode {
fmt.Printf(" -end-\n")
break
}
fmt.Printf(" %6s", n[code])
if key != endCode {
fmt.Printf(" 0x%03X %7s %s %c %3d %3d %3d %s %s",
key, // Key
n[key], // Value
codeString(int(s.prefixes[key])), // Pre1
s.suffixes[key], // Suf1
q.lengthM1s[key], // LM1
q.lengthM1s[key]/Q, // /Q
q.lengthM1s[key]%Q, // %Q
codeString(int(q.prefixes[key])), // PreQ
string(q.suffixes[key][:]), // SufQ
)
}
fmt.Println()
key++
}
return nil
}
type implementation interface {
initialize()
emit(buffer []byte, code int, prevCode int, n int) []byte
}
func decode(impl implementation) {
impl.initialize()
output, prevCode, n := []byte(nil), -1, 0x101
for _, code := range codes {
if code == clearCode {
panic("unimplemented clearCode")
} else if code == endCode {
if string(output) != expectedOutput {
panic("unexpected output")
}
return
} else if code > n {
panic("invalid code")
}
// We have a literal or copy code.
output = impl.emit(output, code, prevCode, n)
prevCode, n = code, n+1
if n >= 4096 {
panic("unimplemented table-is-full")
}
}
panic("missing endCode")
}
// naive is a simple implementation that, in the worst case, requires O(N*N)
// memory.
type naive [4096]string
func (t *naive) initialize() {
for i := 0; i < clearCode; i++ {
t[i] = string([]byte{byte(i)})
}
}
func (t *naive) emit(output []byte, code int, prevCode int, n int) []byte {
if prevCode >= 0 {
prevOutput := t[prevCode]
if code < n {
t[n] = prevOutput + t[code][:1]
} else {
t[n] = prevOutput + prevOutput[:1]
}
}
return append(output, t[code]...)
}
// suf1 keeps separate prefix and suffix tables, 1 byte per suffix.
type suf1 struct {
prefixes [4096]uint16
suffixes [4096]byte
}
func (t *suf1) initialize() {
for i := 0; i < clearCode; i++ {
t.prefixes[i] = noPrefix
t.suffixes[i] = byte(i)
}
}
func (t *suf1) emit(output []byte, code int, prevCode int, n int) []byte {
// Fill an intermediate buffer from back to front, 1 byte at a time.
buffer := [4096]byte{}
i := len(buffer)
c := code
if code == n {
c = prevCode
i--
// buffer[4095] will be set later.
}
firstByte := byte(0)
for {
suffix := t.suffixes[c]
i--
buffer[i] = suffix
c = int(t.prefixes[c])
if c == noPrefix {
firstByte = suffix
break
}
}
if code == n {
buffer[4095] = firstByte
}
// The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of
// 0, is correct conceptually, but unnecessary in practice. Look for "fake
// key-value pair" in std/lzw/README.md.
if prevCode >= 0 {
t.prefixes[n] = uint16(prevCode)
t.suffixes[n] = firstByte
}
return append(output, buffer[i:4096]...)
}
// sufQ keeps separate prefix and suffix tables, up to Q bytes per suffix.
type sufQ struct {
lengthM1s [4096]uint16
prefixes [4096]uint16
suffixes [4096][Q]byte
}
func (t *sufQ) initialize() {
for i := 0; i < clearCode; i++ {
t.lengthM1s[i] = 0
t.prefixes[i] = noPrefix
t.suffixes[i][0] = byte(i)
for j := 1; j < Q; j++ {
// Unnecessary for correct output, but makes the printed table prettier.
t.suffixes[i][j] = '.'
}
}
}
func (t *sufQ) emit(output []byte, code int, prevCode int, n int) []byte {
// Fill an intermediate buffer from back to front, Q bytes at a time.
buffer := [4096 + Q - 1]byte{}
i := len(buffer)
c := code
if code == n {
c = prevCode
i--
// buffer[4095] will be set later.
}
i -= int(t.lengthM1s[c] % Q)
firstByte := byte(0)
for {
suffix := t.suffixes[c]
i -= Q
copy(buffer[i:i+Q], suffix[:])
c = int(t.prefixes[c])
if c == noPrefix {
firstByte = suffix[0]
break
}
}
if code == n {
buffer[4095] = firstByte
}
// The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of
// 0, is correct conceptually, but unnecessary in practice. Look for "fake
// key-value pair" in std/lzw/README.md.
if prevCode >= 0 {
lm1 := t.lengthM1s[prevCode] + 1
t.lengthM1s[n] = lm1
lm1 %= Q
if lm1 != 0 {
// Copy the prevCode's prefix and suffix, and then extend the suffix.
t.prefixes[n] = t.prefixes[prevCode]
t.suffixes[n] = t.suffixes[prevCode]
t.suffixes[n][lm1] = firstByte
} else {
// Start a new suffix.
t.prefixes[n] = uint16(prevCode)
t.suffixes[n][0] = firstByte
for j := 1; j < Q; j++ {
// Unnecessary for correct output, but makes the printed table prettier.
t.suffixes[n][j] = '.'
}
}
}
return append(output, buffer[i:4096]...)
}