| // Copyright 2020 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // TODO: "decoder implements base.token_decoder". |
| |
| pub status "#bad input" |
| pub status "#unsupported recursion depth" |
| |
| pub struct decoder?( |
| )( |
| // stack is conceptually an array of bits, implemented as an array of u32. |
| // The N'th bit being 0 or 1 means that we're in an array or object, where |
| // N is the recursion depth. |
| // |
| // Parsing JSON involves recursion: containers (arrays and objects) can |
| // hold other containers. As child elements are completed, the parser needs |
| // to remember 1 bit of state per recursion depth: whether the parent |
| // container was an array or an object. When continuing to parse the |
| // parent's elements, `, "key": value` is only valid for objects. |
| // |
| // Note that we explicitly track our own stack and depth. We do not use the |
| // call stack to hold this state and the decoder.decode_tokens function is |
| // not recursive per se. |
| // |
| // Wuffs code does not have the capability to dynamically allocate memory, |
| // so the maximum depth is hard-coded at compile time. In this case, the |
| // maximum is 1024 (stack is 1024 bits or 128 bytes). |
| // |
| // The [JSON spec](https://tools.ietf.org/html/rfc7159) clearly states, "an |
| // implementation may set limits on the maximum depth of nesting". |
| // |
| // In comparison, as of February 2020, the Chromium web browser's JSON |
| // parser's maximum recursion depth is 200: |
| // https://source.chromium.org/chromium/chromium/src/+/3dece34cde622faa0daac07156c25d92c9897d1e:base/json/json_common.h;l=18 |
| // |
| // Other languages and libraries' maximum depths (determined empirically) |
| // are listed at https://github.com/lovasoa/bad_json_parsers#results |
| stack : array[1024 / 32] base.u32, |
| ) |
| |
| pub func decoder.decode_tokens?(dst: base.token_writer, src: base.io_reader) { |
| var string_length : base.u32[..= 0xFFFE] |
| var whitespace_length : base.u32[..= 0xFFFE] |
| var depth : base.u32[..= 1024] |
| var stack_byte : base.u32[..= (1024 / 32) - 1] |
| var stack_bit : base.u32[..= 31] |
| var c : base.u8 |
| |
| // expect is a bitmask of what the next token can be: |
| // - 0x01 EXPECT_NON_STRING_VALUE. |
| // - 0x02 EXPECT_STRING_VALUE. |
| // - 0x04 EXPECT_COMMA ','. |
| // - 0x08 EXPECT_COLON ':'. |
| // - 0x10 EXPECT_CLOSE_BRACKET ']'. |
| // - 0x20 EXPECT_CLOSE_BRACE '}'. |
| // |
| // EXPECT_VALUE is also defined to be 0x03, equivalent to |
| // (EXPECT_NON_STRING_VALUE | EXPECT_STRING_VALUE). |
| // |
| // "Non-string value" includes numbers, literals (false, true, null), |
| // arrays ("[etc]") and objects ("{etc}"). |
| // |
| // "String value" includes "this" and "th\u0061t". |
| var expect : base.u8 |
| |
| // expect_after_value is what to expect after seeing a value. For depth 0, |
| // this is ignored. Otherwise, it should be (EXPECT_CLOSE_FOO | |
| // EXPECT_COMMA), for some value of FOO. |
| var expect_after_value : base.u8 |
| |
| expect = 0x03 // EXPECT_VALUE |
| |
| while.outer true { |
| if args.dst.available() <= 0 { |
| yield? base."$short write" |
| continue.outer |
| } |
| |
| // Consume whitespace. |
| whitespace_length = 0 |
| c = 0 |
| while.ws true, |
| inv args.dst.available() > 0, |
| post args.src.available() > 0, |
| { |
| if args.src.available() <= 0 { |
| if whitespace_length > 0 { |
| args.dst.write_fast_token!(value: 0, length: whitespace_length as base.u64) |
| whitespace_length = 0 |
| } |
| yield? base."$short read" |
| whitespace_length = 0 |
| continue.outer |
| } |
| |
| c = args.src.peek_u8() |
| if c > 0x20 { // 0x20 is ' '. |
| break.ws |
| } else if (c <> 0x20) and (c <> 0x0A) and (c <> 0x09) and (c <> 0x0D) { |
| if whitespace_length > 0 { |
| args.dst.write_fast_token!(value: 0, length: whitespace_length as base.u64) |
| } |
| return "#bad input" |
| } |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| |
| if whitespace_length >= 0xFFFE { |
| args.dst.write_fast_token!(value: 0, length: 0xFFFF) |
| whitespace_length = 0 |
| continue.outer |
| } |
| whitespace_length += 1 |
| } |
| |
| // Emit whitespace. |
| if whitespace_length > 0 { |
| args.dst.write_fast_token!(value: 0, length: whitespace_length as base.u64) |
| whitespace_length = 0 |
| if args.dst.available() <= 0 { |
| continue.outer |
| } |
| } |
| |
| // These assertions are redundant (the Wuffs compiler should already |
| // know these facts; deleting these assertions should still compile) |
| // but are listed explicitly to help guard against future edits to the |
| // code above inadvertently invalidating these assertions. |
| assert args.dst.available() > 0 |
| assert args.src.available() > 0 |
| |
| if c <= 0x39 { // 0x39 is '9'. |
| if c == 0x22 { // 0x22 is '"'. |
| // -------- BEGIN parse strings. |
| if 0 == (expect & 0x02) { // 0x02 is EXPECT_STRING_VALUE. |
| return "#bad input" |
| } |
| |
| // The leading '"' is filler. |
| args.dst.write_fast_token!(value: 0, length: 1) |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| |
| while.string_loop true { |
| if args.dst.available() <= 0 { |
| yield? base."$short write" |
| continue.string_loop |
| } |
| |
| string_length = 0 |
| while true, |
| pre args.dst.available() > 0, |
| { |
| if args.src.available() <= 0 { |
| if string_length > 0 { |
| args.dst.write_fast_token!(value: 0x60_0001, length: string_length as base.u64) |
| string_length = 0 |
| } |
| continue.string_loop |
| } |
| c = args.src.peek_u8() |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| |
| if c == 0x22 { // 0x22 is '"'. |
| args.dst.write_fast_token!(value: 0x60_0000, length: string_length as base.u64) |
| string_length = 0 |
| break.string_loop |
| |
| } else if c == 0x5C { // 0x5C is '\\'. |
| // TODO: escapes. |
| return "#bad input" |
| |
| } else if string_length >= 0xFFFE { |
| args.dst.write_fast_token!(value: 0x60_0001, length: 0xFFFF) |
| string_length = 0 |
| continue.string_loop |
| } |
| |
| string_length += 1 |
| } |
| } |
| |
| // The trailing '"' is filler. |
| while true { |
| if args.dst.available() > 0 { |
| args.dst.write_fast_token!(value: 0, length: 1) |
| break |
| } |
| yield? base."$short write" |
| } |
| |
| // As above, expect must have contained EXPECT_STRING_VALUE. If |
| // it didn't also contain EXPECT_NON_STRING_VALUE then we were |
| // parsing an object key and the next token should be ':'. |
| // Otherwise, fall below to "We've just parsed a value". |
| if 0 == (expect & 0x01) { // 0x01 is EXPECT_NON_STRING_VALUE. |
| expect = 0x08 // 0x08 is EXPECT_COLON. |
| continue.outer |
| } |
| // -------- END parse strings. |
| |
| } else if c == 0x2C { // 0x2C is ','. |
| if 0 == (expect & 0x04) { // 0x01 is EXPECT_COMMA. |
| return "#bad input" |
| } |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| args.dst.write_fast_token!(value: 0, length: 1) // The ',' is filler. |
| // What's valid after a comma depends on whether or not we're |
| // in an array or an object. |
| if 0 == (expect & 0x10) { // 0x10 is EXPECT_CLOSE_BRACKET |
| expect = 0x02 // 0x02 is EXPECT_STRING_VALUE. |
| } else { |
| expect = 0x03 // 0x03 is EXPECT_VALUE. |
| } |
| continue.outer |
| |
| } else { |
| // TODO: numbers. |
| return "#bad input" |
| } |
| |
| } else if c == 0x3A { // 0x3A is ':'. |
| if 0 == (expect & 0x08) { // 0x08 is EXPECT_COLON. |
| return "#bad input" |
| } |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| args.dst.write_fast_token!(value: 0, length: 1) // The ':' is filler. |
| expect = 0x03 // 0x03 is EXPECT_VALUE. |
| continue.outer |
| |
| } else if c == 0x5B { // 0x5B is '['. |
| if 0 == (expect & 0x01) { // 0x01 is EXPECT_NON_STRING_VALUE. |
| return "#bad input" |
| } |
| if depth >= 1024 { |
| return "#unsupported recursion depth" |
| } |
| stack_byte = depth / 32 |
| stack_bit = depth & 31 |
| this.stack[stack_byte] &= 0 ^ ((1 as base.u32) << stack_bit) |
| depth += 1 |
| |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| args.dst.write_fast_token!(value: 0x20_0011, length: 1) |
| expect = 0x13 // 0x13 is (EXPECT_CLOSE_BRACKET | EXPECT_VALUE). |
| expect_after_value = 0x14 // 0x14 is (EXPECT_CLOSE_BRACKET | EXPECT_COMMA). |
| continue.outer |
| |
| } else if c == 0x5D { // 0x5D is ']'. |
| if 0 == (expect & 0x10) { // 0x10 is EXPECT_CLOSE_BRACKET |
| return "#bad input" |
| } |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| if depth <= 1 { |
| args.dst.write_fast_token!(value: 0x20_0012, length: 1) |
| break.outer |
| } |
| depth -= 1 |
| stack_byte = (depth - 1) / 32 |
| stack_bit = (depth - 1) & 31 |
| if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) { |
| args.dst.write_fast_token!(value: 0x20_1012, length: 1) |
| // 0x14 is (EXPECT_CLOSE_BRACKET | EXPECT_COMMA). |
| expect = 0x14 |
| expect_after_value = 0x14 |
| } else { |
| args.dst.write_fast_token!(value: 0x20_2012, length: 1) |
| // 0x24 is (EXPECT_CLOSE_BRACE | EXPECT_COMMA). |
| expect = 0x24 |
| expect_after_value = 0x24 |
| } |
| continue.outer |
| |
| } else if c == 0x7B { // 0x7B is '{'. |
| if 0 == (expect & 0x01) { // 0x01 is EXPECT_NON_STRING_VALUE. |
| return "#bad input" |
| } |
| if depth >= 1024 { |
| return "#unsupported recursion depth" |
| } |
| stack_byte = depth / 32 |
| stack_bit = depth & 31 |
| this.stack[stack_byte] |= (1 as base.u32) << stack_bit |
| depth += 1 |
| |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| args.dst.write_fast_token!(value: 0x20_0021, length: 1) |
| expect = 0x22 // 0x22 is (EXPECT_CLOSE_BRACE | EXPECT_STRING_VALUE). |
| expect_after_value = 0x24 // 0x24 is (EXPECT_CLOSE_BRACE | EXPECT_COMMA). |
| continue.outer |
| |
| } else if c == 0x7D { // 0x7D is '}'. |
| if 0 == (expect & 0x20) { // 0x20 is EXPECT_CLOSE_BRACE. |
| return "#bad input" |
| } |
| args.src.skip32_fast!(actual: 1, worst_case: 1) |
| if depth <= 1 { |
| args.dst.write_fast_token!(value: 0x20_0022, length: 1) |
| break.outer |
| } |
| depth -= 1 |
| stack_byte = (depth - 1) / 32 |
| stack_bit = (depth - 1) & 31 |
| if 0 == (this.stack[stack_byte] & ((1 as base.u32) << stack_bit)) { |
| args.dst.write_fast_token!(value: 0x20_1022, length: 1) |
| // 0x14 is (EXPECT_CLOSE_BRACKET | EXPECT_COMMA). |
| expect = 0x14 |
| expect_after_value = 0x14 |
| } else { |
| args.dst.write_fast_token!(value: 0x20_2022, length: 1) |
| // 0x24 is (EXPECT_CLOSE_BRACE | EXPECT_COMMA). |
| expect = 0x24 |
| expect_after_value = 0x24 |
| } |
| continue.outer |
| |
| } else { |
| // TODO: literals (false, true, null). |
| return "#bad input" |
| } |
| |
| // We've just parsed a value. |
| if depth == 0 { |
| break.outer |
| } |
| expect = expect_after_value |
| } |
| } |