blob: f6623c449f46aa7664b83a1d2f2ffdb4942cc97b [file] [log] [blame]
// Copyright 2017 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package token
import (
"errors"
"fmt"
"unicode/utf8"
)
const (
maxID = 1048575
maxLine = 1048575
maxTokenSize = 1023
)
var backslashes = [256]byte{
'"': 0x22 | 0x80,
'\'': 0x27 | 0x80,
'/': 0x2F | 0x80,
'0': 0x00 | 0x80,
'?': 0x3F | 0x80,
'\\': 0x5C | 0x80,
'a': 0x07 | 0x80,
'b': 0x08 | 0x80,
'e': 0x1B | 0x80,
'f': 0x0C | 0x80,
'n': 0x0A | 0x80,
'r': 0x0D | 0x80,
't': 0x09 | 0x80,
'v': 0x0B | 0x80,
}
func Unescape(s string) (unescaped string, ok bool) {
if len(s) < 2 {
return "", false
}
switch s[0] {
case '"':
if s[len(s)-1] == '"' {
s = s[1 : len(s)-1]
} else {
return "", false
}
case '\'':
if s[len(s)-1] == '\'' {
s = s[1 : len(s)-1]
} else if (len(s) >= 4) && (s[len(s)-3] == '\'') &&
((s[len(s)-2] == 'b') || (s[len(s)-2] == 'l')) &&
(s[len(s)-1] == 'e') { // "be" or "le" suffix.
s = s[1 : len(s)-3]
} else {
return "", false
}
default:
return "", false
}
for i := 0; ; i++ {
if i == len(s) {
// There were no backslashes.
return s, true
}
if s[i] == '\\' {
break
}
}
// There were backslashes.
b := make([]byte, 0, len(s))
for i := 0; i < len(s); {
if s[i] != '\\' {
b = append(b, s[i])
i += 1
continue
} else if i >= (len(s) - 1) {
// No-op.
} else if x := backslashes[s[i+1]]; x != 0 {
b = append(b, x&0x7F)
i += 2
continue
} else if (s[i+1] == 'x') && (i < (len(s) - 3)) {
u0 := unhex(s[i+2])
u1 := unhex(s[i+3])
u := (u0 << 4) | u1
if 0 <= u {
b = append(b, uint8(u))
i += 4
continue
}
} else if (s[i+1] == 'u') && (i < (len(s) - 5)) {
u0 := unhex(s[i+2])
u1 := unhex(s[i+3])
u2 := unhex(s[i+4])
u3 := unhex(s[i+5])
u := (u0 << 12) | (u1 << 8) | (u2 << 4) | u3
if (u >= 0) && utf8.ValidRune(u) {
e := [utf8.UTFMax]byte{}
n := utf8.EncodeRune(e[:], u)
b = append(b, e[:n]...)
i += 6
continue
}
} else if (s[i+1] == 'U') && (i < (len(s) - 9)) {
u0 := unhex(s[i+2])
u1 := unhex(s[i+3])
u2 := unhex(s[i+4])
u3 := unhex(s[i+5])
u4 := unhex(s[i+6])
u5 := unhex(s[i+7])
u6 := unhex(s[i+8])
u7 := unhex(s[i+9])
u := (u0 << 28) | (u1 << 24) | (u2 << 20) | (u3 << 16) |
(u4 << 12) | (u5 << 8) | (u6 << 4) | u7
if (u >= 0) && utf8.ValidRune(u) {
e := [utf8.UTFMax]byte{}
n := utf8.EncodeRune(e[:], u)
b = append(b, e[:n]...)
i += 10
continue
}
}
return "", false
}
return string(b), true
}
type Map struct {
byName map[string]ID
byID []string
}
func (m *Map) Insert(name string) (ID, error) {
if name == "" {
return 0, nil
}
if id, ok := builtInsByName[name]; ok {
return id, nil
}
if m.byName == nil {
m.byName = map[string]ID{}
}
if id, ok := m.byName[name]; ok {
return id, nil
}
id := nBuiltInIDs + ID(len(m.byID))
if id > maxID {
return 0, errors.New("token: too many distinct tokens")
}
m.byName[name] = id
m.byID = append(m.byID, name)
return id, nil
}
func (m *Map) ByName(name string) ID {
if id, ok := builtInsByName[name]; ok {
return id
}
if m.byName != nil {
return m.byName[name]
}
return 0
}
func (m *Map) ByID(x ID) string {
if x < nBuiltInIDs {
return builtInsByID[x]
}
x -= nBuiltInIDs
if uint(x) < uint(len(m.byID)) {
return m.byID[x]
}
return ""
}
func unhex(c byte) int32 {
switch {
case 'A' <= c && c <= 'F':
return int32(c) - ('A' - 10)
case 'a' <= c && c <= 'f':
return int32(c) - ('a' - 10)
case '0' <= c && c <= '9':
return int32(c) - '0'
}
return -1
}
func alpha(c byte) bool {
return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || (c == '_')
}
func alphaNumeric(c byte) bool {
return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || (c == '_') || ('0' <= c && c <= '9')
}
func hexaNumericUnderscore(c byte) bool {
return ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f') || (c == '_') || ('0' <= c && c <= '9')
}
func zeroOneUnderscore(c byte) bool {
return (c == '_') || ('0' <= c && c <= '1')
}
func numeric(c byte) bool {
return ('0' <= c && c <= '9')
}
func numericUnderscore(c byte) bool {
return (c == '_') || ('0' <= c && c <= '9')
}
func hasPrefix(a []byte, s string) bool {
if len(s) == 0 {
return true
}
if len(a) < len(s) {
return false
}
a = a[:len(s)]
for i := range a {
if a[i] != s[i] {
return false
}
}
return true
}
// checkNumericUnderscores rejects consecutive or trailing underscores.
func checkNumericUnderscores(a []byte) bool {
prevUnderscore := false
for _, c := range a {
currUnderscore := c == '_'
if prevUnderscore && currUnderscore {
return false
}
prevUnderscore = currUnderscore
}
return !prevUnderscore
}
func Tokenize(m *Map, filename string, src []byte) (tokens []Token, comments []string, retErr error) {
line := uint32(1)
loop:
for i := 0; i < len(src); {
c := src[i]
if c <= ' ' {
if c == '\n' {
if len(tokens) > 0 && tokens[len(tokens)-1].ID.IsImplicitSemicolon(m) {
tokens = append(tokens, Token{IDSemicolon, line})
}
if line == maxLine {
return nil, nil, fmt.Errorf("token: too many lines in %q", filename)
}
line++
}
i++
continue
}
if (c == '"') || (c == '\'') {
quote := c
j := i + 1
for j < len(src) {
c = src[j]
j++
if c == quote {
break
} else if c == '\\' {
if quote == '"' {
return nil, nil, fmt.Errorf("token: backslash in \"-string at %s:%d", filename, line)
}
} else if c == '\n' {
return nil, nil, fmt.Errorf("token: expected final %c in string at %s:%d", quote, filename, line)
} else if c < ' ' {
return nil, nil, fmt.Errorf("token: control character in string at %s:%d", filename, line)
}
}
hasEndian := (quote == '\'') && (j < (len(src) - 2)) &&
((src[j] == 'b') || (src[j] == 'l')) &&
(src[j+1] == 'e')
if hasEndian {
j += 2
}
if j-i > maxTokenSize {
return nil, nil, fmt.Errorf("token: string too long at %s:%d", filename, line)
}
s := string(src[i:j])
if quote == '\'' {
if unescaped, ok := Unescape(s); !ok {
return nil, nil, fmt.Errorf("token: invalid '-string at %s:%d", filename, line)
} else if (len(unescaped) > 1) && !hasEndian {
return nil, nil, fmt.Errorf("token: multi-byte '-string needs be or le suffix at %s:%d", filename, line)
}
}
id, err := m.Insert(s)
if err != nil {
return nil, nil, err
}
tokens = append(tokens, Token{id, line})
i = j
continue
}
if alpha(c) {
j := i + 1
for ; j < len(src) && alphaNumeric(src[j]); j++ {
if j-i == maxTokenSize {
return nil, nil, fmt.Errorf("token: identifier too long at %s:%d", filename, line)
}
}
id, err := m.Insert(string(src[i:j]))
if err != nil {
return nil, nil, err
}
tokens = append(tokens, Token{id, line})
i = j
continue
}
if numeric(c) {
// TODO: 0b11 binary numbers.
j, isDigit := i+1, numericUnderscore
if c == '0' && j < len(src) {
if next := src[j]; next == 'x' || next == 'X' {
j, isDigit = j+1, hexaNumericUnderscore
} else if next == 'b' || next == 'B' {
j, isDigit = j+1, zeroOneUnderscore
} else if numeric(next) {
return nil, nil, fmt.Errorf("token: legacy octal syntax at %s:%d", filename, line)
}
}
for ; j < len(src) && isDigit(src[j]); j++ {
if j-i == maxTokenSize {
return nil, nil, fmt.Errorf("token: constant too long at %s:%d", filename, line)
}
}
if !checkNumericUnderscores(src[i:j]) {
return nil, nil, fmt.Errorf("token: invalid numeric literal at %s:%d", filename, line)
}
id, err := m.Insert(string(src[i:j]))
if err != nil {
return nil, nil, err
}
tokens = append(tokens, Token{id, line})
i = j
continue
}
if c == '/' && i+1 < len(src) && src[i+1] == '/' {
h := i
i += 2
for ; i < len(src) && src[i] != '\n'; i++ {
}
for uint32(len(comments)) < line {
comments = append(comments, "")
}
comments = append(comments, string(src[h:i]))
continue
}
if id := squiggles[c]; id != 0 {
i++
tokens = append(tokens, Token{id, line})
continue
}
for _, x := range lexers[c] {
if hasPrefix(src[i+1:], x.suffix) {
i += len(x.suffix) + 1
tokens = append(tokens, Token{x.id, line})
continue loop
}
}
msg := ""
if c <= 0x7F {
msg = fmt.Sprintf("byte '\\x%02X' (%q)", c, c)
} else {
msg = fmt.Sprintf("non-ASCII byte '\\x%02X'", c)
}
return nil, nil, fmt.Errorf("token: unrecognized %s at %s:%d", msg, filename, line)
}
return tokens, comments, nil
}