blob: 64746cfba048858f585b60e6a1208e57c68d769d [file] [log] [blame]
// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub struct decoder? implements base.token_decoder(
)(
// stack is conceptually an array of bits, implemented as an array of u32.
// The N'th bit being 0 or 1 means that we're in an array or object, where
// N is the recursion depth.
//
// Parsing JSON involves recursion: containers (arrays and objects) can
// hold other containers. As child elements are completed, the parser needs
// to remember 1 bit of state per recursion depth: whether the parent
// container was an array or an object. When continuing to parse the
// parent's elements, `, "key": value` is only valid for objects.
//
// Note that we explicitly track our own stack and depth. We do not use the
// call stack to hold this state and the decoder.decode_tokens function is
// not recursive per se.
//
// Wuffs code does not have the capability to dynamically allocate memory,
// so the maximum depth is hard-coded at compile time. In this case, the
// maximum is 1024 (stack is 1024 bits or 128 bytes).
//
// The [JSON spec](https://tools.ietf.org/html/rfc7159) clearly states, "an
// implementation may set limits on the maximum depth of nesting".
//
// In comparison, as of February 2020, the Chromium web browser's JSON
// parser's maximum recursion depth is 200:
// https://source.chromium.org/chromium/chromium/src/+/3dece34cde622faa0daac07156c25d92c9897d1e:base/json/json_common.h;l=18
//
// Other languages and libraries' maximum depths (determined empirically)
// are listed at https://github.com/lovasoa/bad_json_parsers#results
stack : array[1024 / 32] base.u32,
)
pub func decoder.decode_tokens?(dst: base.token_writer, src: base.io_reader) {
var token_value : base.u64[..= 0xFF_FFFF]
var number_length : base.u32[..= 0x3FF]
var number_status : base.u32[..= 0x3]
var string_length : base.u32[..= 0xFFFE]
var whitespace_length : base.u32[..= 0xFFFE]
var depth : base.u32[..= 1024]
var stack_byte : base.u32[..= (1024 / 32) - 1]
var stack_bit : base.u32[..= 31]
var match : base.u32[..= 2]
var c : base.u8
var backslash : base.u8
var char : base.u8
var class : base.u8[..= 0x0F]
var uni4_ok : base.u8
var uni4_string : base.u64
var uni4_value : base.u32[..= 0xFFFF]
var uni4_high_surrogate : base.u32[..= 0x10_FC00]
var uni4_rollback : base.u32
// expect is a bitmask of what the next character class can be.
//
// expect_after_value is what to expect after seeing a value (a literal,
// number, string, array or object). For depth 0, this is ignored.
// Otherwise, it should be (EXPECT_CLOSE_FOO | EXPECT_COMMA), for some
// value of FOO.
var expect : base.u32
var expect_after_value : base.u32
expect = 0x0EB2 // EXPECT_VALUE
while.outer true {
while.goto_parsed_a_leaf_value true {
if args.dst.available() <= 0 {
yield? base."$short write"
continue.outer
}
// Consume whitespace.
whitespace_length = 0
c = 0
class = 0
while.ws true,
inv args.dst.available() > 0,
post args.src.available() > 0,
{
if args.src.available() <= 0 {
if whitespace_length > 0 {
args.dst.write_fast_token!(
value: 0,
length: whitespace_length as base.u64)
whitespace_length = 0
}
if args.src.is_closed() {
return "#bad input"
}
yield? base."$short read"
whitespace_length = 0
continue.outer
}
c = args.src.peek_u8()
class = lut_classes[c]
if class <> 0x00 { // 0x00 is CLASS_WHITESPACE.
break.ws
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
if whitespace_length >= 0xFFFE {
args.dst.write_fast_token!(
value: 0,
length: 0xFFFF)
whitespace_length = 0
continue.outer
}
whitespace_length += 1
} endwhile.ws
// Emit whitespace.
if whitespace_length > 0 {
args.dst.write_fast_token!(
value: 0,
length: whitespace_length as base.u64)
whitespace_length = 0
if args.dst.available() <= 0 {
continue.outer
}
}
// Check expected character classes.
if 0 == (expect & ((1 as base.u32) << class)) {
return "#bad input"
}
// These assertions are redundant (the Wuffs compiler should
// already know these facts; deleting these assertions should still
// compile) but are listed explicitly to guard against future edits
// to the code above inadvertently invalidating these assertions.
assert args.dst.available() > 0
assert args.src.available() > 0
if class == 0x01 { // 0x01 is CLASS_STRING.
// -------- BEGIN parse strings.
// The leading '"' is filler.
args.dst.write_fast_token!(
value: 0,
length: 1)
args.src.skip32_fast!(actual: 1, worst_case: 1)
// If the string contents starts with an escape code, issue an
// incomplete zero-length string token (with
// WUFFS_BASE__TOKEN__VBC__STRING) before issuing a token with
// WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT.
while true {
if args.src.available() <= 0 {
if args.src.is_closed() {
return "#bad input"
}
yield? base."$short read"
continue
}
if args.src.peek_u8() <> 0x5C { // 0x5C is '\\'.
break
}
if args.dst.available() <= 0 {
yield? base."$short write"
continue
}
args.dst.write_fast_token!(
value: 0x20_7F31,
length: 0)
break
}
while.string_loop true {
if args.dst.available() <= 0 {
yield? base."$short write"
continue.string_loop
}
string_length = 0
while true,
pre args.dst.available() > 0,
{
if args.src.available() <= 0 {
if string_length > 0 {
args.dst.write_fast_token!(
value: 0x20_0221,
length: string_length as base.u64)
string_length = 0
}
if args.src.is_closed() {
return "#bad input"
}
yield? base."$short read"
string_length = 0
continue.string_loop
}
c = args.src.peek_u8()
char = lut_chars[c]
if char == 0x00 { // Non-special ASCII.
args.src.skip32_fast!(actual: 1, worst_case: 1)
if string_length >= 0xFFFE {
args.dst.write_fast_token!(
value: 0x20_0221,
length: 0xFFFF)
string_length = 0
continue.string_loop
}
string_length += 1
continue
} else if char == 0x01 { // '"'
args.dst.write_fast_token!(
value: 0x20_0220,
length: string_length as base.u64)
string_length = 0
break.string_loop
} else if char == 0x02 { // '\\'.
if string_length > 0 {
args.dst.write_fast_token!(
value: 0x20_0221,
length: string_length as base.u64)
string_length = 0
if args.dst.available() <= 0 {
continue.string_loop
}
}
assert args.dst.available() > 0
if args.src.available() < 2 {
if args.src.is_closed() {
return "#bad backslash-escape"
}
yield? base."$short read"
string_length = 0
char = 0
continue.string_loop
}
c = (args.src.peek_u16le() >> 8) as base.u8
backslash = lut_backslashes[c]
if backslash > 0 {
args.src.skip32_fast!(actual: 2, worst_case: 2)
args.dst.write_fast_token!(
value: 0x40_0000 | ((backslash & 0x7F) as base.u64),
length: 2)
continue.string_loop
} else if c == 0x75 { // 0x75 is 'u'.
// -------- BEGIN backslash-u.
if args.src.available() < 6 {
if args.src.is_closed() {
return "#bad backslash-escape"
}
yield? base."$short read"
string_length = 0
char = 0
continue.string_loop
}
uni4_string = args.src.peek_u48le_as_u64() >> 16
uni4_value = 0
uni4_ok = 0x80
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 0)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 12
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 8)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 8
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 16)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 4
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 24)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 0
if uni4_ok == 0 {
// It wasn't 4 hexadecimal digits. No-op
// (and fall through to "#bad
// backslash-escape").
} else if (uni4_value < 0xD800) or (0xDFFF < uni4_value) {
// Not a Unicode surrogate. We're good.
args.src.skip32_fast!(actual: 6, worst_case: 6)
args.dst.write_fast_token!(
value: 0x40_0000 | (uni4_value as base.u64),
length: 6)
continue.string_loop
} else if uni4_value >= 0xDC00 {
// Low surrogate. No-op (and fall through
// to "#bad backslash-escape").
} else {
// High surrogate, which needs to be
// followed by a "\\u1234" low surrogate.
// We've already peeked 6 bytes for the
// high surrogate. We need another 6.
if args.src.available() < 12 {
if args.src.is_closed() {
return "#bad backslash-escape"
}
yield? base."$short read"
string_length = 0
uni4_value = 0
char = 0
continue.string_loop
}
// Roll forward 4 bytes, so that calling
// peek_u64le will pick up the last 8.
args.src.skip32_fast!(actual: 4, worst_case: 4)
uni4_string = args.src.peek_u64le() >> 16
// Look for the low surrogate's "\\u".
if ((0xFF & (uni4_string >> 0)) <> 0x5C) or
((0xFF & (uni4_string >> 8)) <> 0x75) {
uni4_high_surrogate = 0
uni4_value = 0
uni4_ok = 0
} else {
uni4_high_surrogate =
0x1_0000 + ((uni4_value - 0xD800) << 10)
uni4_value = 0
uni4_ok = 0x80
uni4_string >>= 16
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 0)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 12
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 8)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 8
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 16)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 4
c = lut_hexadecimal_digits[0xFF & (uni4_string >> 24)]
uni4_ok &= c
uni4_value |= ((c & 0x0F) as base.u32) << 0
}
if (uni4_ok <> 0) and
(0xDC00 <= uni4_value) and (uni4_value <= 0xDFFF) {
// Emit a single token for the surrogate pair.
uni4_value -= 0xDC00
args.src.skip32_fast!(actual: 8, worst_case: 8)
args.dst.write_fast_token!(
value: 0x40_0000 | ((uni4_high_surrogate + uni4_value) as base.u64),
length: 12)
continue.string_loop
} else {
// Roll back the 4 bytes, then fall
// through to "#invalid
// backslash-escape".
uni4_rollback = 4
while uni4_rollback > 0 {
uni4_rollback -= 1
if args.src.can_undo_byte() {
args.src.undo_byte!()
} else {
return "#internal error: inconsistent I/O"
}
}
}
}
// -------- END backslash-u.
}
return "#bad backslash-escape"
} else if char <= 0x10 {
// TODO: reject invalid UTF-8.
args.src.skip32_fast!(actual: 1, worst_case: 1)
if string_length >= 0xFFFE {
args.dst.write_fast_token!(
value: 0x20_0221,
length: 0xFFFF)
string_length = 0
continue.string_loop
}
string_length += 1
continue
}
if string_length > 0 {
args.dst.write_fast_token!(
value: 0x20_0221,
length: string_length as base.u64)
string_length = 0
}
if char == 0x80 {
return "#bad C0 control code"
}
return "#bad UTF-8"
}
} endwhile.string_loop
// The trailing '"' is filler.
while true {
if args.src.available() <= 0 {
if args.src.is_closed() {
return "#bad input"
}
yield? base."$short read"
continue
}
if args.dst.available() <= 0 {
yield? base."$short write"
continue
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(
value: 0,
length: 1)
break
}
// As above, expect must have contained EXPECT_STRING. If it
// didn't also contain EXPECT_NUMBER then we were parsing an
// object key and the next token should be ':'.
if 0 == (expect & 0x0010) { // 0x0010 is EXPECT_NUMBER.
expect = 0x0008 // 0x0008 is EXPECT_COLON.
continue.outer
}
break.goto_parsed_a_leaf_value
// -------- END parse strings.
} else if class == 0x02 { // 0x02 is CLASS_COMMA.
args.src.skip32_fast!(actual: 1, worst_case: 1)
// The ',' is filler.
args.dst.write_fast_token!(
value: 0,
length: 1)
// What's valid after a comma depends on whether or not we're
// in an array or an object.
if 0 == (expect & 0x0100) { // 0x0100 is EXPECT_CLOSE_SQUARE_BRACKET
expect = 0x0002 // 0x0002 is EXPECT_STRING.
} else {
expect = 0x0EB2 // 0x0EB2 is EXPECT_VALUE.
}
continue.outer
} else if class == 0x03 { // 0x03 is CLASS_COLON.
args.src.skip32_fast!(actual: 1, worst_case: 1)
// The ':' is filler.
args.dst.write_fast_token!(
value: 0,
length: 1)
expect = 0x0EB2 // 0x0EB2 is EXPECT_VALUE.
continue.outer
} else if class == 0x04 { // 0x04 is CLASS_NUMBER.
// -------- BEGIN parse numbers.
while true,
pre args.dst.available() > 0,
{
number_length = this.decode_number!(src: args.src)
number_status = number_length >> 8
token_value = 0x60_0004
if (number_length & 0x80) <> 0 {
token_value = 0x60_0002
}
number_length = number_length & 0x7F
if number_status == 0 {
args.dst.write_fast_token!(
value: token_value,
length: number_length as base.u64)
break
}
while number_length > 0 {
number_length -= 1
if args.src.can_undo_byte() {
args.src.undo_byte!()
} else {
return "#internal error: inconsistent I/O"
}
}
if number_status == 1 {
return "#bad input"
} else if number_status == 2 {
return "#unsupported number length"
} else {
yield? base."$short read"
while args.dst.available() <= 0,
post args.dst.available() > 0,
{
yield? base."$short write"
}
}
}
break.goto_parsed_a_leaf_value
// -------- END parse numbers.
} else if class == 0x05 { // 0x05 is CLASS_OPEN_CURLY_BRACE.
token_value = 0x80_4011
if depth == 0 {
// No-op.
} else if 0 <> (expect_after_value & 0x0040) { // 0x0040 is EXPECT_CLOSE_CURLY_BRACE.
token_value = 0x80_4041
} else {
token_value = 0x80_4021
}
if depth >= 1024 {
return "#unsupported recursion depth"
}
stack_byte = depth / 32
stack_bit = depth & 31
this.stack[stack_byte] |= (1 as base.u32) << stack_bit
depth += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(
value: token_value,
length: 1)
expect = 0x0042 // 0x0042 is (EXPECT_CLOSE_CURLY_BRACE | EXPECT_STRING).
expect_after_value = 0x0044 // 0x0044 is (EXPECT_CURLY_CLOSE_BRACE | EXPECT_COMMA).
continue.outer
} else if class == 0x06 { // 0x06 is CLASS_CLOSE_CURLY_BRACE.
args.src.skip32_fast!(actual: 1, worst_case: 1)
if depth <= 1 {
args.dst.write_fast_token!(
value: 0x80_1042,
length: 1)
break.outer
}
depth -= 1
stack_byte = (depth - 1) / 32
stack_bit = (depth - 1) & 31
if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) {
args.dst.write_fast_token!(
value: 0x80_2042,
length: 1)
// 0x0104 is (EXPECT_SQUARE_CLOSE_BRACKET | EXPECT_COMMA).
expect = 0x0104
expect_after_value = 0x0104
} else {
args.dst.write_fast_token!(
value: 0x80_4042,
length: 1)
// 0x0044 is (EXPECT_CURLY_CLOSE_BRACE | EXPECT_COMMA).
expect = 0x0044
expect_after_value = 0x0044
}
continue.outer
} else if class == 0x07 { // 0x07 is CLASS_OPEN_SQUARE_BRACKET.
token_value = 0x80_2011
if depth == 0 {
// No-op.
} else if 0 <> (expect_after_value & 0x0040) { // 0x0040 is EXPECT_CLOSE_CURLY_BRACE.
token_value = 0x80_2041
} else {
token_value = 0x80_2021
}
if depth >= 1024 {
return "#unsupported recursion depth"
}
stack_byte = depth / 32
stack_bit = depth & 31
this.stack[stack_byte] &= 0xFFFF_FFFF ^ ((1 as base.u32) << stack_bit)
depth += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(
value: token_value,
length: 1)
expect = 0x0FB2 // 0x0FB2 is (EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_VALUE).
expect_after_value = 0x0104 // 0x0104 is (EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA).
continue.outer
} else if class == 0x08 { // 0x08 is CLASS_CLOSE_SQUARE_BRACKET.
args.src.skip32_fast!(actual: 1, worst_case: 1)
if depth <= 1 {
args.dst.write_fast_token!(
value: 0x80_1022,
length: 1)
break.outer
}
depth -= 1
stack_byte = (depth - 1) / 32
stack_bit = (depth - 1) & 31
if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) {
args.dst.write_fast_token!(
value: 0x80_2022,
length: 1)
// 0x0104 is (EXPECT_CLOSE_SQUARE_BRACKET | EXPECT_COMMA).
expect = 0x0104
expect_after_value = 0x0104
} else {
args.dst.write_fast_token!(
value: 0x80_4022,
length: 1)
// 0x0044 is (EXPECT_CLOSE_CURLY_BRACE | EXPECT_COMMA).
expect = 0x0044
expect_after_value = 0x0044
}
continue.outer
} else if class == 0x09 { // 0x09 is CLASS_FALSE.
match = args.src.match7(a: 0x6573_6C61_6605) // 5 bytes "false".
if match == 0 {
args.dst.write_fast_token!(
value: 0x60_0401,
length: 5)
if args.src.available() < 5 {
return "#internal error: inconsistent I/O"
}
args.src.skip32_fast!(actual: 5, worst_case: 5)
break.goto_parsed_a_leaf_value
} else if match == 1 {
yield? base."$short read"
continue.outer
}
} else if class == 0x0A { // 0x0A is CLASS_TRUE.
match = args.src.match7(a: 0x65_7572_7404) // 4 bytes "true".
if match == 0 {
args.dst.write_fast_token!(
value: 0x60_0801,
length: 4)
if args.src.available() < 4 {
return "#internal error: inconsistent I/O"
}
args.src.skip32_fast!(actual: 4, worst_case: 4)
break.goto_parsed_a_leaf_value
} else if match == 1 {
yield? base."$short read"
continue.outer
}
} else if class == 0x0B { // 0x0B is CLASS_NULL.
match = args.src.match7(a: 0x6C_6C75_6E04) // 4 bytes "null".
if match == 0 {
args.dst.write_fast_token!(
value: 0x60_0201,
length: 4)
if args.src.available() < 4 {
return "#internal error: inconsistent I/O"
}
args.src.skip32_fast!(actual: 4, worst_case: 4)
break.goto_parsed_a_leaf_value
} else if match == 1 {
yield? base."$short read"
continue.outer
}
}
return "#bad input"
} endwhile.goto_parsed_a_leaf_value
// We've just parsed a leaf (non-container) value: literal (null,
// false, true), number or string.
if depth == 0 {
break.outer
}
expect = expect_after_value
} endwhile.outer
}
pri func decoder.decode_number!(src: base.io_reader) base.u32[..= 0x3FF] {
var c : base.u8
var n : base.u32[..= 0x3FF]
var floating_point : base.u32[..= 0x80]
while.goto_done true {
n = 0
// Peek.
if args.src.available() <= 0 {
if not args.src.is_closed() {
n |= 0x300
}
break.goto_done
}
c = args.src.peek_u8()
// Scan the optional minus sign.
if c <> 0x2D { // 0x2D is '-'.
assert args.src.available() > 0
assert n <= 1
} else {
n += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
// Peek.
if args.src.available() <= 0 {
if not args.src.is_closed() {
n |= 0x300
}
n |= 0x100 // A '-' without digits is invalid.
break.goto_done
}
c = args.src.peek_u8()
assert args.src.available() > 0
assert n <= 1
}
// Scan the opening digits.
if c == 0x30 { // 0x30 is '0'.
n += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
assert n <= 0xFD
} else {
n = this.decode_digits!(src: args.src, n: n)
if n > 0xFD {
break.goto_done
}
assert n <= 0xFD
}
// Peek.
if args.src.available() <= 0 {
if not args.src.is_closed() {
n |= 0x300
}
break.goto_done
}
c = args.src.peek_u8()
// Scan the optional fraction.
if c <> 0x2E { // 0x2E is '.'.
assert args.src.available() > 0
assert n <= 0xFD
} else {
floating_point = 0x80
n += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
n = this.decode_digits!(src: args.src, n: n)
if n > 0xFD {
break.goto_done
}
// Peek.
if args.src.available() <= 0 {
if not args.src.is_closed() {
n |= 0x300
}
break.goto_done
}
c = args.src.peek_u8()
assert args.src.available() > 0
assert n <= 0xFD
}
// Scan the optional 'E' or 'e'.
if (c <> 0x45) and (c <> 0x65) { // 0x45 and 0x65 are 'E' and 'e'.
break.goto_done
}
floating_point = 0x80
n += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
assert n <= 0xFE
// Peek.
if args.src.available() <= 0 {
if not args.src.is_closed() {
n |= 0x300
}
n |= 0x100 // An 'E' or 'e' without digits is invalid.
break.goto_done
}
c = args.src.peek_u8()
// Scan the optional '+' or '-'.
if (c <> 0x2B) and (c <> 0x2D) { // 0x2B and 0x2D are '+' and '-'.
assert n <= 255
} else {
n += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
assert n <= 255
}
// Scan the exponent digits.
n = this.decode_digits!(src: args.src, n: n)
break.goto_done
} endwhile.goto_done
return n | floating_point
}
pri func decoder.decode_digits!(src: base.io_reader, n: base.u32[..= 0xFF]) base.u32[..= 0x3FF] {
var c : base.u8
var n : base.u32[..= 0x3FF]
n = args.n
while true {
if args.src.available() <= 0 {
if not args.src.is_closed() {
n |= 0x300
}
break
}
c = args.src.peek_u8()
if (c < 0x30) or (0x39 < c) {
break
}
// Cap the maximum supported number length at an arbitrary value, 99.
// The caller's src.data.len should therefore be at least 100.
//
// An example of a JSON number that is 81 bytes long is:
// https://github.com/nst/JSONTestSuite/blob/master/test_parsing/y_number_double_close_to_zero.json
//
// Note that 99 (in hex, 0x63) is less than 0x80, so we can use 0x80 as
// a flag bit in func decoder.decode_number.
if n >= 99 {
n |= 0x200
break
}
n += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
}
if n == args.n {
n |= 0x100
}
return n
}