blob: c65f33a0bff24f7c79b1db2cd9b6559a0dd8cd31 [file] [log] [blame]
// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// TODO: "decoder implements base.token_decoder".
pub status "#bad input"
pub status "#unsupported recursion depth"
pub struct decoder?(
)(
// stack is conceptually an array of bits, implemented as an array of u32.
// The N'th bit being 0 or 1 means that we're in an array or object, where
// N is the recursion depth.
//
// Parsing JSON involves recursion: containers (arrays and objects) can
// hold other containers. As child elements are completed, the parser needs
// to remember 1 bit of state per recursion depth: whether the parent
// container was an array or an object. When continuing to parse the
// parent's elements, `, "key": value` is only valid for objects.
//
// Note that we explicitly track our own stack and depth. We do not use the
// call stack to hold this state and the decoder.decode_tokens function is
// not recursive per se.
//
// Wuffs code does not have the capability to dynamically allocate memory,
// so the maximum depth is hard-coded at compile time. In this case, the
// maximum is 1024 (stack is 1024 bits or 128 bytes).
//
// The [JSON spec](https://tools.ietf.org/html/rfc7159) clearly states, "an
// implementation may set limits on the maximum depth of nesting".
//
// In comparison, as of February 2020, the Chromium web browser's JSON
// parser's maximum recursion depth is 200:
// https://source.chromium.org/chromium/chromium/src/+/3dece34cde622faa0daac07156c25d92c9897d1e:base/json/json_common.h;l=18
//
// Other languages and libraries' maximum depths (determined empirically)
// are listed at https://github.com/lovasoa/bad_json_parsers#results
stack : array[1024 / 32] base.u32,
)
pub func decoder.decode_tokens?(dst: base.token_writer, src: base.io_reader) {
var string_length : base.u32[..= 0xFFFE]
var whitespace_length : base.u32[..= 0xFFFE]
var depth : base.u32[..= 1024]
var stack_byte : base.u32[..= (1024 / 32) - 1]
var stack_bit : base.u32[..= 31]
var c : base.u8
// expect is a bitmask of what the next token can be:
// - 0x01 EXPECT_NON_STRING_VALUE.
// - 0x02 EXPECT_STRING_VALUE.
// - 0x04 EXPECT_COMMA ','.
// - 0x08 EXPECT_COLON ':'.
// - 0x10 EXPECT_CLOSE_BRACKET ']'.
// - 0x20 EXPECT_CLOSE_BRACE '}'.
//
// EXPECT_VALUE is also defined to be 0x03, equivalent to
// (EXPECT_NON_STRING_VALUE | EXPECT_STRING_VALUE).
//
// "Non-string value" includes numbers, literals (false, true, null),
// arrays ("[etc]") and objects ("{etc}").
//
// "String value" includes "this" and "th\u0061t".
var expect : base.u8
// expect_after_value is what to expect after seeing a value. For depth 0,
// this is ignored. Otherwise, it should be (EXPECT_CLOSE_FOO |
// EXPECT_COMMA), for some value of FOO.
var expect_after_value : base.u8
expect = 0x03 // EXPECT_VALUE
while.outer true {
if args.dst.available() <= 0 {
yield? base."$short write"
continue.outer
}
// Consume whitespace.
whitespace_length = 0
c = 0
while.ws true,
inv args.dst.available() > 0,
post args.src.available() > 0,
{
if args.src.available() <= 0 {
if whitespace_length > 0 {
args.dst.write_fast_token!(value: 0, length: whitespace_length as base.u64)
whitespace_length = 0
}
yield? base."$short read"
whitespace_length = 0
continue.outer
}
c = args.src.peek_u8()
if c > 0x20 { // 0x20 is ' '.
break.ws
} else if (c <> 0x20) and (c <> 0x0A) and (c <> 0x09) and (c <> 0x0D) {
if whitespace_length > 0 {
args.dst.write_fast_token!(value: 0, length: whitespace_length as base.u64)
}
return "#bad input"
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
if whitespace_length >= 0xFFFE {
args.dst.write_fast_token!(value: 0, length: 0xFFFF)
whitespace_length = 0
continue.outer
}
whitespace_length += 1
}
// Emit whitespace.
if whitespace_length > 0 {
args.dst.write_fast_token!(value: 0, length: whitespace_length as base.u64)
whitespace_length = 0
if args.dst.available() <= 0 {
continue.outer
}
}
// These assertions are redundant (the Wuffs compiler should already
// know these facts; deleting these assertions should still compile)
// but are listed explicitly to help guard against future edits to the
// code above inadvertently invalidating these assertions.
assert args.dst.available() > 0
assert args.src.available() > 0
if c <= 0x39 { // 0x39 is '9'.
if c == 0x22 { // 0x22 is '"'.
// -------- BEGIN parse strings.
if 0 == (expect & 0x02) { // 0x02 is EXPECT_STRING_VALUE.
return "#bad input"
}
// The leading '"' is filler.
args.dst.write_fast_token!(value: 0, length: 1)
args.src.skip32_fast!(actual: 1, worst_case: 1)
while.string_loop true {
if args.dst.available() <= 0 {
yield? base."$short write"
continue.string_loop
}
string_length = 0
while true,
pre args.dst.available() > 0,
{
if args.src.available() <= 0 {
if string_length > 0 {
args.dst.write_fast_token!(value: 0x60_0001, length: string_length as base.u64)
string_length = 0
}
continue.string_loop
}
c = args.src.peek_u8()
args.src.skip32_fast!(actual: 1, worst_case: 1)
if c == 0x22 { // 0x22 is '"'.
args.dst.write_fast_token!(value: 0x60_0000, length: string_length as base.u64)
string_length = 0
break.string_loop
} else if c == 0x5C { // 0x5C is '\\'.
// TODO: escapes.
return "#bad input"
} else if string_length >= 0xFFFE {
args.dst.write_fast_token!(value: 0x60_0001, length: 0xFFFF)
string_length = 0
continue.string_loop
}
string_length += 1
}
}
// The trailing '"' is filler.
while true {
if args.dst.available() > 0 {
args.dst.write_fast_token!(value: 0, length: 1)
break
}
yield? base."$short write"
}
// As above, expect must have contained EXPECT_STRING_VALUE. If
// it didn't also contain EXPECT_NON_STRING_VALUE then we were
// parsing an object key and the next token should be ':'.
// Otherwise, fall below to "We've just parsed a value".
if 0 == (expect & 0x01) { // 0x01 is EXPECT_NON_STRING_VALUE.
expect = 0x08 // 0x08 is EXPECT_COLON.
continue.outer
}
// -------- END parse strings.
} else if c == 0x2C { // 0x2C is ','.
if 0 == (expect & 0x04) { // 0x01 is EXPECT_COMMA.
return "#bad input"
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(value: 0, length: 1) // The ',' is filler.
// What's valid after a comma depends on whether or not we're
// in an array or an object.
if 0 == (expect & 0x10) { // 0x10 is EXPECT_CLOSE_BRACKET
expect = 0x02 // 0x02 is EXPECT_STRING_VALUE.
} else {
expect = 0x03 // 0x03 is EXPECT_VALUE.
}
continue.outer
} else {
// TODO: numbers.
return "#bad input"
}
} else if c == 0x3A { // 0x3A is ':'.
if 0 == (expect & 0x08) { // 0x08 is EXPECT_COLON.
return "#bad input"
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(value: 0, length: 1) // The ':' is filler.
expect = 0x03 // 0x03 is EXPECT_VALUE.
continue.outer
} else if c == 0x5B { // 0x5B is '['.
if 0 == (expect & 0x01) { // 0x01 is EXPECT_NON_STRING_VALUE.
return "#bad input"
}
if depth >= 1024 {
return "#unsupported recursion depth"
}
stack_byte = depth / 32
stack_bit = depth & 31
this.stack[stack_byte] &= 0 ^ ((1 as base.u32) << stack_bit)
depth += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(value: 0x20_0011, length: 1)
expect = 0x13 // 0x13 is (EXPECT_CLOSE_BRACKET | EXPECT_VALUE).
expect_after_value = 0x14 // 0x14 is (EXPECT_CLOSE_BRACKET | EXPECT_COMMA).
continue.outer
} else if c == 0x5D { // 0x5D is ']'.
if 0 == (expect & 0x10) { // 0x10 is EXPECT_CLOSE_BRACKET
return "#bad input"
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
if depth <= 1 {
args.dst.write_fast_token!(value: 0x20_0012, length: 1)
break.outer
}
depth -= 1
stack_byte = (depth - 1) / 32
stack_bit = (depth - 1) & 31
if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) {
args.dst.write_fast_token!(value: 0x20_1012, length: 1)
// 0x14 is (EXPECT_CLOSE_BRACKET | EXPECT_COMMA).
expect = 0x14
expect_after_value = 0x14
} else {
args.dst.write_fast_token!(value: 0x20_2012, length: 1)
// 0x24 is (EXPECT_CLOSE_BRACE | EXPECT_COMMA).
expect = 0x24
expect_after_value = 0x24
}
continue.outer
} else if c == 0x7B { // 0x7B is '{'.
if 0 == (expect & 0x01) { // 0x01 is EXPECT_NON_STRING_VALUE.
return "#bad input"
}
if depth >= 1024 {
return "#unsupported recursion depth"
}
stack_byte = depth / 32
stack_bit = depth & 31
this.stack[stack_byte] |= (1 as base.u32) << stack_bit
depth += 1
args.src.skip32_fast!(actual: 1, worst_case: 1)
args.dst.write_fast_token!(value: 0x20_0021, length: 1)
expect = 0x22 // 0x22 is (EXPECT_CLOSE_BRACE | EXPECT_STRING_VALUE).
expect_after_value = 0x24 // 0x24 is (EXPECT_CLOSE_BRACE | EXPECT_COMMA).
continue.outer
} else if c == 0x7D { // 0x7D is '}'.
if 0 == (expect & 0x20) { // 0x20 is EXPECT_CLOSE_BRACE.
return "#bad input"
}
args.src.skip32_fast!(actual: 1, worst_case: 1)
if depth <= 1 {
args.dst.write_fast_token!(value: 0x20_0022, length: 1)
break.outer
}
depth -= 1
stack_byte = (depth - 1) / 32
stack_bit = (depth - 1) & 31
if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) {
args.dst.write_fast_token!(value: 0x20_1022, length: 1)
// 0x14 is (EXPECT_CLOSE_BRACKET | EXPECT_COMMA).
expect = 0x14
expect_after_value = 0x14
} else {
args.dst.write_fast_token!(value: 0x20_2022, length: 1)
// 0x24 is (EXPECT_CLOSE_BRACE | EXPECT_COMMA).
expect = 0x24
expect_after_value = 0x24
}
continue.outer
} else {
// TODO: literals (false, true, null).
return "#bad input"
}
// We've just parsed a value.
if depth == 0 {
break.outer
}
expect = expect_after_value
}
}