Add an "extended token" concept

commit: 462f86675d89db5b6f25793c31e5e466943eb4ae [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Wed Apr 01 23:01:51 2020 +1100
committer: Nigel Tao <nigeltao@golang.org> Thu Apr 02 11:20:45 2020 +1100
tree: a66fdc2c35e50c8865d6984661b67318d2a5cdbc
parent: 7d684f94ae1fc6444202a7bfa7fb778a293a9c89 [diff]
diff --git a/example/jsonfindptrs/jsonfindptrs.cc b/example/jsonfindptrs/jsonfindptrs.cc
index 325661e..d5dc096 100644
--- a/example/jsonfindptrs/jsonfindptrs.cc
+++ b/example/jsonfindptrs/jsonfindptrs.cc

@@ -433,7 +433,7 @@
       return Result(std::move(tsr.status_msg));
     }
 
-    uint64_t vbc = tsr.token.value_base_category();
+    int64_t vbc = tsr.token.value_base_category();
     uint64_t vbd = tsr.token.value_base_detail();
     switch (vbc) {
       case WUFFS_BASE__TOKEN__VBC__FILLER:
@@ -469,7 +469,7 @@
     if (!tsr.status_msg.empty()) {
       return Result(std::move(tsr.status_msg));
     }
-    uint64_t vbc = tsr.token.value_base_category();
+    int64_t vbc = tsr.token.value_base_category();
     uint64_t vbd = tsr.token.value_base_detail();
     if (vbc == WUFFS_BASE__TOKEN__VBC__FILLER) {
       ts.next();
@@ -556,7 +556,7 @@
     if (!tsr.status_msg.empty()) {
       return Result(std::move(tsr.status_msg));
     }
-    uint64_t vbc = tsr.token.value_base_category();
+    int64_t vbc = tsr.token.value_base_category();
     uint64_t vbd = tsr.token.value_base_detail();
     if (vbc == WUFFS_BASE__TOKEN__VBC__FILLER) {
       ts.next();
@@ -596,7 +596,7 @@
   JsonThing jt;
   jt.kind = Kind::String;
   while (true) {
-    uint64_t vbc = tsr.token.value_base_category();
+    int64_t vbc = tsr.token.value_base_category();
     uint64_t vbd = tsr.token.value_base_detail();
 
     switch (vbc) {

diff --git a/example/jsonptr/jsonptr.cc b/example/jsonptr/jsonptr.cc
index 9fd764f..b3270f7 100644
--- a/example/jsonptr/jsonptr.cc
+++ b/example/jsonptr/jsonptr.cc

@@ -836,7 +836,7 @@
 const char*  //
 handle_token(wuffs_base__token t) {
   do {
-    uint64_t vbc = t.value_base_category();
+    int64_t vbc = t.value_base_category();
     uint64_t vbd = t.value_base_detail();
     uint64_t len = t.length();
 

diff --git a/fuzz/c/std/json_fuzzer.c b/fuzz/c/std/json_fuzzer.c
index 3e76676..9ccbbe1 100644
--- a/fuzz/c/std/json_fuzzer.c
+++ b/fuzz/c/std/json_fuzzer.c

@@ -107,7 +107,7 @@
     return "fuzz: internal error: inconsistent link bits";
   }
 
-  uint64_t vbc = wuffs_base__token__value_base_category(&t);
+  int64_t vbc = wuffs_base__token__value_base_category(&t);
   uint64_t vbd = wuffs_base__token__value_base_detail(&t);
 
   switch (vbc) {

diff --git a/internal/cgen/base/token-public.h b/internal/cgen/base/token-public.h
index 1c4c7c2..2b85d7d 100644
--- a/internal/cgen/base/token-public.h
+++ b/internal/cgen/base/token-public.h

@@ -23,6 +23,7 @@
   // |  1  |      21     |   3   |      21     |  1  |  1  |     16    |
   // +-----+-------------+-------+-------------+-----+-----+-----------+
   // [..................value..................]  LP    LN     length
+  // [..1..|..........~value_extension.........]
   // [..0..|.value_major.|.....value_minor.....]
   // [..0..|.........VBC.........|.....VBD.....]
   //
@@ -31,8 +32,15 @@
   //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).
   //  - Bits 15 ..  0 (16 bits) is the length.
   //
-  // The value bits can be sub-divided in multiple ways:
-  //  - Bits 63 .. 63 ( 1 bits) is reserved (a zero bit).
+  // ----
+  //
+  // The value bits can be sub-divided in multiple ways. First, the high bit:
+  //  - Bits 63 .. 63 ( 1 bits) is an extended (1) or simple (0) token.
+  //
+  // For extended tokens:
+  //  - Bits 62 .. 18 (45 bits) is the bitwise-not (~) of the value_extension.
+  //
+  // For simple tokens:
   //  - Bits 62 .. 42 (21 bits) is the value_major.
   //  - Bits 41 .. 18 (24 bits) is the value_minor.
   //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).
@@ -46,13 +54,15 @@
   //
   // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the
   // high 22 bits (the value_major) are all zero. An equivalent test is that
-  // the high 25 bits (the notional VBC) has a numerical value less than 8.
+  // the high 25 bits (the notional VBC) has a value in the range 0 ..= 7.
   //
   // At 21 bits, the VBD can hold every valid Unicode code point.
   //
   // If value_major is non-zero then value_minor has whatever arbitrary meaning
   // the tokenizer's package assigns to it.
   //
+  // ----
+  //
   // Multiple consecutive tokens can form a larger conceptual unit. For
   // example, an "abc\tz" string is a single higher level concept but at the
   // lower level, it could consist of multiple tokens: the quotes '"', the
@@ -66,10 +76,11 @@
   uint64_t repr;
 
 #ifdef __cplusplus
-  inline uint64_t value() const;
-  inline uint64_t value_major() const;
+  inline int64_t value() const;
+  inline int64_t value_extension() const;
+  inline int64_t value_major() const;
+  inline int64_t value_base_category() const;
   inline uint64_t value_minor() const;
-  inline uint64_t value_base_category() const;
   inline uint64_t value_base_detail() const;
   inline bool link_prev() const;
   inline bool link_next() const;
@@ -89,18 +100,11 @@
 
 #define WUFFS_BASE__TOKEN__LENGTH__MAX_INCL 0xFFFF
 
-#define WUFFS_BASE__TOKEN__VALUE__MASK 0x3FFFFFFFFFFF
-#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0x3FFFFF
-#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF
-#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x1FFFFFF
-#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF
-#define WUFFS_BASE__TOKEN__LINK__MASK 0x3
-#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF
-
 #define WUFFS_BASE__TOKEN__VALUE__SHIFT 18
+#define WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT 18
 #define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42
-#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
 #define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39
+#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
 #define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18
 #define WUFFS_BASE__TOKEN__LINK__SHIFT 16
 #define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0
@@ -194,34 +198,42 @@
 
 // --------
 
-static inline uint64_t  //
+// wuffs_base__token__value returns the token's high 46 bits, sign-extended. A
+// negative value means an extended token, non-negative means a simple token.
+static inline int64_t  //
 wuffs_base__token__value(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE__MASK;
+  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE__SHIFT;
 }
 
-static inline uint64_t  //
+// wuffs_base__token__value_extension returns a negative value if the token was
+// not an extended token.
+static inline int64_t  //
+wuffs_base__token__value_extension(const wuffs_base__token* t) {
+  return (~(int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT;
+}
+
+// wuffs_base__token__value_major returns a negative value if the token was not
+// a simple token.
+static inline int64_t  //
 wuffs_base__token__value_major(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;
+  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT;
+}
+
+// wuffs_base__token__value_base_category returns a negative value if the token
+// was not a simple token.
+static inline int64_t  //
+wuffs_base__token__value_base_category(const wuffs_base__token* t) {
+  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT;
 }
 
 static inline uint64_t  //
 wuffs_base__token__value_minor(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;
-}
-
-static inline uint64_t  //
-wuffs_base__token__value_base_category(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;
+  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) & 0xFFFFFF;
 }
 
 static inline uint64_t  //
 wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK;
+  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) & 0x1FFFFF;
 }
 
 static inline bool  //
@@ -236,33 +248,37 @@
 
 static inline uint64_t  //
 wuffs_base__token__length(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &
-         WUFFS_BASE__TOKEN__LENGTH__MASK;
+  return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) & 0xFFFF;
 }
 
 #ifdef __cplusplus
 
-inline uint64_t  //
+inline int64_t  //
 wuffs_base__token::value() const {
   return wuffs_base__token__value(this);
 }
 
-inline uint64_t  //
+inline int64_t  //
+wuffs_base__token::value_extension() const {
+  return wuffs_base__token__value_extension(this);
+}
+
+inline int64_t  //
 wuffs_base__token::value_major() const {
   return wuffs_base__token__value_major(this);
 }
 
+inline int64_t  //
+wuffs_base__token::value_base_category() const {
+  return wuffs_base__token__value_base_category(this);
+}
+
 inline uint64_t  //
 wuffs_base__token::value_minor() const {
   return wuffs_base__token__value_minor(this);
 }
 
 inline uint64_t  //
-wuffs_base__token::value_base_category() const {
-  return wuffs_base__token__value_base_category(this);
-}
-
-inline uint64_t  //
 wuffs_base__token::value_base_detail() const {
   return wuffs_base__token__value_base_detail(this);
 }

diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 437a056..3e5b0bd 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go

@@ -381,11 +381,15 @@
 	""
 
 const baseTokenPublicH = "" +
-	"// ---------------- Tokens\n\ntypedef struct {\n  // The repr's 64 bits are divided as:\n  //\n  // +-----+-------------+-------+-------------+-----+-----+-----------+\n  // |  1  |      21     |   3   |      21     |  1  |  1  |     16    |\n  // +-----+-------------+-------+-------------+-----+-----+-----------+\n  // [..................value..................]  LP    LN     length\n  // [..0..|.value_major.|.....value_minor.....]\n  // [..0..|.........VBC.........|.....VBD.....]\n  //\n  // The broad divisions are:\n  //  - Bits 63 .. 18 (46 bits) is the value.\n  //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).\n  //  - Bits 15 ..  0 (16 bits) is the length.\n  //\n  // The value bits can be sub-divided in multiple ways:\n  //  - Bits 63 .. 63 ( 1 bits) is reserved (a zero bit).\n  //  - Bits 62 .. 42 (21 bits) is the value_major.\n  //  - Bits 41 .. 18 (24 bits) is the value_minor.\n  //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).\n  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_de" +
-	"tail).\n  //\n  // The value_major is a 21-bit [Base38](doc/note/base38-and-fourcc.md) value.\n  // If all of its bits are zero (special cased for Wuffs' built-in \"base\"\n  // package) then the value_minor is further sub-divided:\n  //  - Bits 41 .. 39 ( 3 bits) is the VBC (value_base_category).\n  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).\n  //\n  // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the\n  // high 22 bits (the value_major) are all zero. An equivalent test is that\n  // the high 25 bits (the notional VBC) has a numerical value less than 8.\n  //\n  // At 21 bits, the VBD can hold every valid Unicode code point.\n  //\n  // If value_major is non-zero then value_minor has whatever arbitrary meaning\n  // the tokenizer's package assigns to it.\n  //\n  // Multiple consecutive tokens can form a larger conceptual unit. For\n  // example, an \"abc\\tz\" string is a single higher level concept but at the\n  // lower level, it could consist of multiple tokens: the quotes '\"', the\n  " +
-	"// ASCII texts \"abc\" and \"z\" and the backslash-escaped tab '\\t'. The LP and\n  // LN (link_prev and link_next) bits denote tokens that are part of a\n  // multi-token chain:\n  //  - LP means that this token is not the first (there is a previous token).\n  //  - LN means that this token is not the last  (there is a next     token).\n  //\n  // In particular, a stand-alone token will have both link bits set to zero.\n  uint64_t repr;\n\n#ifdef __cplusplus\n  inline uint64_t value() const;\n  inline uint64_t value_major() const;\n  inline uint64_t value_minor() const;\n  inline uint64_t value_base_category() const;\n  inline uint64_t value_base_detail() const;\n  inline bool link_prev() const;\n  inline bool link_next() const;\n  inline uint64_t length() const;\n#endif  // __cplusplus\n\n} wuffs_base__token;\n\nstatic inline wuffs_base__token  //\nwuffs_base__make_token(uint64_t repr) {\n  wuffs_base__token ret;\n  ret.repr = repr;\n  return ret;\n}\n\n  " +
+	"// ---------------- Tokens\n\ntypedef struct {\n  // The repr's 64 bits are divided as:\n  //\n  // +-----+-------------+-------+-------------+-----+-----+-----------+\n  // |  1  |      21     |   3   |      21     |  1  |  1  |     16    |\n  // +-----+-------------+-------+-------------+-----+-----+-----------+\n  // [..................value..................]  LP    LN     length\n  // [..1..|..........~value_extension.........]\n  // [..0..|.value_major.|.....value_minor.....]\n  // [..0..|.........VBC.........|.....VBD.....]\n  //\n  // The broad divisions are:\n  //  - Bits 63 .. 18 (46 bits) is the value.\n  //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).\n  //  - Bits 15 ..  0 (16 bits) is the length.\n  //\n  " +
 	"" +
-	"// --------\n\n#define WUFFS_BASE__TOKEN__LENGTH__MAX_INCL 0xFFFF\n\n#define WUFFS_BASE__TOKEN__VALUE__MASK 0x3FFFFFFFFFFF\n#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0x3FFFFF\n#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF\n#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x1FFFFFF\n#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF\n#define WUFFS_BASE__TOKEN__LINK__MASK 0x3\n#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF\n\n#define WUFFS_BASE__TOKEN__VALUE__SHIFT 18\n#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42\n#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18\n#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39\n#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18\n#define WUFFS_BASE__TOKEN__LINK__SHIFT 16\n#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0\n\n#define WUFFS_BASE__TOKEN__LINK_PREV 0x20000\n#define WUFFS_BASE__TOKEN__LINK_NEXT 0x10000\n\n  " +
+	"// ----\n  //\n  // The value bits can be sub-divided in multiple ways. First, the high bit:\n  //  - Bits 63 .. 63 ( 1 bits) is an extended (1) or simple (0) token.\n  //\n  // For extended tokens:\n  //  - Bits 62 .. 18 (45 bits) is the bitwise-not (~) of the value_extension.\n  //\n  // For simple tokens:\n  //  - Bits 62 .. 42 (21 bits) is the value_major.\n  //  - Bits 41 .. 18 (24 bits) is the value_minor.\n  //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).\n  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).\n  //\n  // The value_major is a 21-bit [Base38](doc/note/base38-and-fourcc.md) value.\n  // If all of its bits are zero (special cased for Wuffs' built-in \"base\"\n  // package) then the value_minor is further sub-divided:\n  //  - Bits 41 .. 39 ( 3 bits) is the VBC (value_base_category).\n  //  - Bits 38 .. 18 (21 bits) is the VBD (value_base_detail).\n  //\n  // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the\n  // high 22 bits (the value_major) are all zero. An eq" +
+	"uivalent test is that\n  // the high 25 bits (the notional VBC) has a value in the range 0 ..= 7.\n  //\n  // At 21 bits, the VBD can hold every valid Unicode code point.\n  //\n  // If value_major is non-zero then value_minor has whatever arbitrary meaning\n  // the tokenizer's package assigns to it.\n  //\n  " +
+	"" +
+	"// ----\n  //\n  // Multiple consecutive tokens can form a larger conceptual unit. For\n  // example, an \"abc\\tz\" string is a single higher level concept but at the\n  // lower level, it could consist of multiple tokens: the quotes '\"', the\n  // ASCII texts \"abc\" and \"z\" and the backslash-escaped tab '\\t'. The LP and\n  // LN (link_prev and link_next) bits denote tokens that are part of a\n  // multi-token chain:\n  //  - LP means that this token is not the first (there is a previous token).\n  //  - LN means that this token is not the last  (there is a next     token).\n  //\n  // In particular, a stand-alone token will have both link bits set to zero.\n  uint64_t repr;\n\n#ifdef __cplusplus\n  inline int64_t value() const;\n  inline int64_t value_extension() const;\n  inline int64_t value_major() const;\n  inline int64_t value_base_category() const;\n  inline uint64_t value_minor() const;\n  inline uint64_t value_base_detail() const;\n  inline bool link_prev() const;\n  inline bool link_next() const;\n  inline uint64_t length() " +
+	"const;\n#endif  // __cplusplus\n\n} wuffs_base__token;\n\nstatic inline wuffs_base__token  //\nwuffs_base__make_token(uint64_t repr) {\n  wuffs_base__token ret;\n  ret.repr = repr;\n  return ret;\n}\n\n  " +
+	"" +
+	"// --------\n\n#define WUFFS_BASE__TOKEN__LENGTH__MAX_INCL 0xFFFF\n\n#define WUFFS_BASE__TOKEN__VALUE__SHIFT 18\n#define WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT 18\n#define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42\n#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39\n#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18\n#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18\n#define WUFFS_BASE__TOKEN__LINK__SHIFT 16\n#define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0\n\n#define WUFFS_BASE__TOKEN__LINK_PREV 0x20000\n#define WUFFS_BASE__TOKEN__LINK_NEXT 0x10000\n\n  " +
 	"" +
 	"// --------\n\n#define WUFFS_BASE__TOKEN__VBC__FILLER 0\n#define WUFFS_BASE__TOKEN__VBC__STRUCTURE 1\n#define WUFFS_BASE__TOKEN__VBC__STRING 2\n#define WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT 3\n#define WUFFS_BASE__TOKEN__VBC__LITERAL 4\n#define WUFFS_BASE__TOKEN__VBC__NUMBER 5\n\n  " +
 	"" +
@@ -401,9 +405,9 @@
 	"// --------\n\n// For a source string of \"123\" or \"0x9A\", it is valid for a tokenizer to\n// return any one of:\n//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT.\n//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED.\n//  - WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_UNSIGNED.\n//\n// For a source string of \"+123\" or \"-0x9A\", only the first two are valid.\n//\n// For a source string of \"123.\", only the first one is valid.\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_FLOATING_POINT 0x00001\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_SIGNED 0x00002\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_INTEGER_UNSIGNED 0x00004\n\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_NEG_INF 0x00010\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_POS_INF 0x00020\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_NEG_NAN 0x00040\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__CONTENT_POS_NAN 0x00080\n\n// The number 300 might be represented as \"\\x01\\x2C\", \"\\x2C\\x01\\x00\\x00\" or\n// \"300\", which are big-endian, li" +
 	"ttle-endian or text. For binary formats, the\n// token length discriminates e.g. u16 little-endian vs u32 little-endian.\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_BINARY_BIG_ENDIAN 0x00100\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_BINARY_LITTLE_ENDIAN 0x00200\n#define WUFFS_BASE__TOKEN__VBD__NUMBER__FORMAT_TEXT 0x00400\n\n" +
 	"" +
-	"// --------\n\nstatic inline uint64_t  //\nwuffs_base__token__value(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &\n         WUFFS_BASE__TOKEN__VALUE__MASK;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__value_major(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &\n         WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__value_minor(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &\n         WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__value_base_category(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &\n         WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__value_base_detail(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &\n         WUFFS_BASE__TOKEN__VALUE_BASE_DETA" +
-	"IL__MASK;\n}\n\nstatic inline bool  //\nwuffs_base__token__link_prev(const wuffs_base__token* t) {\n  return t->repr & WUFFS_BASE__TOKEN__LINK_PREV;\n}\n\nstatic inline bool  //\nwuffs_base__token__link_next(const wuffs_base__token* t) {\n  return t->repr & WUFFS_BASE__TOKEN__LINK_NEXT;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__length(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &\n         WUFFS_BASE__TOKEN__LENGTH__MASK;\n}\n\n#ifdef __cplusplus\n\ninline uint64_t  //\nwuffs_base__token::value() const {\n  return wuffs_base__token__value(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::value_major() const {\n  return wuffs_base__token__value_major(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::value_minor() const {\n  return wuffs_base__token__value_minor(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::value_base_category() const {\n  return wuffs_base__token__value_base_category(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::value_base_detail() const {\n  return wuffs_base__to" +
-	"ken__value_base_detail(this);\n}\n\ninline bool  //\nwuffs_base__token::link_prev() const {\n  return wuffs_base__token__link_prev(this);\n}\n\ninline bool  //\nwuffs_base__token::link_next() const {\n  return wuffs_base__token__link_next(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::length() const {\n  return wuffs_base__token__length(this);\n}\n\n#endif  // __cplusplus\n\n" +
+	"// --------\n\n// wuffs_base__token__value returns the token's high 46 bits, sign-extended. A\n// negative value means an extended token, non-negative means a simple token.\nstatic inline int64_t  //\nwuffs_base__token__value(const wuffs_base__token* t) {\n  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE__SHIFT;\n}\n\n// wuffs_base__token__value_extension returns a negative value if the token was\n// not an extended token.\nstatic inline int64_t  //\nwuffs_base__token__value_extension(const wuffs_base__token* t) {\n  return (~(int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT;\n}\n\n// wuffs_base__token__value_major returns a negative value if the token was not\n// a simple token.\nstatic inline int64_t  //\nwuffs_base__token__value_major(const wuffs_base__token* t) {\n  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT;\n}\n\n// wuffs_base__token__value_base_category returns a negative value if the token\n// was not a simple token.\nstatic inline int64_t  //\nwuffs_base__token__value_base_cat" +
+	"egory(const wuffs_base__token* t) {\n  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__value_minor(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) & 0xFFFFFF;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__value_base_detail(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) & 0x1FFFFF;\n}\n\nstatic inline bool  //\nwuffs_base__token__link_prev(const wuffs_base__token* t) {\n  return t->repr & WUFFS_BASE__TOKEN__LINK_PREV;\n}\n\nstatic inline bool  //\nwuffs_base__token__link_next(const wuffs_base__token* t) {\n  return t->repr & WUFFS_BASE__TOKEN__LINK_NEXT;\n}\n\nstatic inline uint64_t  //\nwuffs_base__token__length(const wuffs_base__token* t) {\n  return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) & 0xFFFF;\n}\n\n#ifdef __cplusplus\n\ninline int64_t  //\nwuffs_base__token::value() const {\n  return wuffs_base__token__value(this);\n}\n\ninline int64_t  //\nwuffs_base_" +
+	"_token::value_extension() const {\n  return wuffs_base__token__value_extension(this);\n}\n\ninline int64_t  //\nwuffs_base__token::value_major() const {\n  return wuffs_base__token__value_major(this);\n}\n\ninline int64_t  //\nwuffs_base__token::value_base_category() const {\n  return wuffs_base__token__value_base_category(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::value_minor() const {\n  return wuffs_base__token__value_minor(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::value_base_detail() const {\n  return wuffs_base__token__value_base_detail(this);\n}\n\ninline bool  //\nwuffs_base__token::link_prev() const {\n  return wuffs_base__token__link_prev(this);\n}\n\ninline bool  //\nwuffs_base__token::link_next() const {\n  return wuffs_base__token__link_next(this);\n}\n\ninline uint64_t  //\nwuffs_base__token::length() const {\n  return wuffs_base__token__length(this);\n}\n\n#endif  // __cplusplus\n\n" +
 	"" +
 	"// --------\n\ntypedef WUFFS_BASE__SLICE(wuffs_base__token) wuffs_base__slice_token;\n\nstatic inline wuffs_base__slice_token  //\nwuffs_base__make_slice_token(wuffs_base__token* ptr, size_t len) {\n  wuffs_base__slice_token ret;\n  ret.ptr = ptr;\n  ret.len = len;\n  return ret;\n}\n\n" +
 	"" +

diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index e139fb5..81e6283 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c

@@ -1856,6 +1856,7 @@
   // |  1  |      21     |   3   |      21     |  1  |  1  |     16    |
   // +-----+-------------+-------+-------------+-----+-----+-----------+
   // [..................value..................]  LP    LN     length
+  // [..1..|..........~value_extension.........]
   // [..0..|.value_major.|.....value_minor.....]
   // [..0..|.........VBC.........|.....VBD.....]
   //
@@ -1864,8 +1865,15 @@
   //  - Bits 17 .. 16 ( 2 bits) is LP and LN (link_prev and link_next).
   //  - Bits 15 ..  0 (16 bits) is the length.
   //
-  // The value bits can be sub-divided in multiple ways:
-  //  - Bits 63 .. 63 ( 1 bits) is reserved (a zero bit).
+  // ----
+  //
+  // The value bits can be sub-divided in multiple ways. First, the high bit:
+  //  - Bits 63 .. 63 ( 1 bits) is an extended (1) or simple (0) token.
+  //
+  // For extended tokens:
+  //  - Bits 62 .. 18 (45 bits) is the bitwise-not (~) of the value_extension.
+  //
+  // For simple tokens:
   //  - Bits 62 .. 42 (21 bits) is the value_major.
   //  - Bits 41 .. 18 (24 bits) is the value_minor.
   //  - Bits 62 .. 39 (24 bits) is the VBC (value_base_category).
@@ -1879,13 +1887,15 @@
   //
   // The high 46 bits (bits 63 .. 18) only have VBC and VBD semantics when the
   // high 22 bits (the value_major) are all zero. An equivalent test is that
-  // the high 25 bits (the notional VBC) has a numerical value less than 8.
+  // the high 25 bits (the notional VBC) has a value in the range 0 ..= 7.
   //
   // At 21 bits, the VBD can hold every valid Unicode code point.
   //
   // If value_major is non-zero then value_minor has whatever arbitrary meaning
   // the tokenizer's package assigns to it.
   //
+  // ----
+  //
   // Multiple consecutive tokens can form a larger conceptual unit. For
   // example, an "abc\tz" string is a single higher level concept but at the
   // lower level, it could consist of multiple tokens: the quotes '"', the
@@ -1899,10 +1909,11 @@
   uint64_t repr;
 
 #ifdef __cplusplus
-  inline uint64_t value() const;
-  inline uint64_t value_major() const;
+  inline int64_t value() const;
+  inline int64_t value_extension() const;
+  inline int64_t value_major() const;
+  inline int64_t value_base_category() const;
   inline uint64_t value_minor() const;
-  inline uint64_t value_base_category() const;
   inline uint64_t value_base_detail() const;
   inline bool link_prev() const;
   inline bool link_next() const;
@@ -1922,18 +1933,11 @@
 
 #define WUFFS_BASE__TOKEN__LENGTH__MAX_INCL 0xFFFF
 
-#define WUFFS_BASE__TOKEN__VALUE__MASK 0x3FFFFFFFFFFF
-#define WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK 0x3FFFFF
-#define WUFFS_BASE__TOKEN__VALUE_MINOR__MASK 0xFFFFFF
-#define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK 0x1FFFFFF
-#define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK 0x1FFFFF
-#define WUFFS_BASE__TOKEN__LINK__MASK 0x3
-#define WUFFS_BASE__TOKEN__LENGTH__MASK 0xFFFF
-
 #define WUFFS_BASE__TOKEN__VALUE__SHIFT 18
+#define WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT 18
 #define WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT 42
-#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
 #define WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT 39
+#define WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT 18
 #define WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT 18
 #define WUFFS_BASE__TOKEN__LINK__SHIFT 16
 #define WUFFS_BASE__TOKEN__LENGTH__SHIFT 0
@@ -2027,34 +2031,42 @@
 
 // --------
 
-static inline uint64_t  //
+// wuffs_base__token__value returns the token's high 46 bits, sign-extended. A
+// negative value means an extended token, non-negative means a simple token.
+static inline int64_t  //
 wuffs_base__token__value(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE__MASK;
+  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE__SHIFT;
 }
 
-static inline uint64_t  //
+// wuffs_base__token__value_extension returns a negative value if the token was
+// not an extended token.
+static inline int64_t  //
+wuffs_base__token__value_extension(const wuffs_base__token* t) {
+  return (~(int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_EXTENSION__SHIFT;
+}
+
+// wuffs_base__token__value_major returns a negative value if the token was not
+// a simple token.
+static inline int64_t  //
 wuffs_base__token__value_major(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_MAJOR__MASK;
+  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_MAJOR__SHIFT;
+}
+
+// wuffs_base__token__value_base_category returns a negative value if the token
+// was not a simple token.
+static inline int64_t  //
+wuffs_base__token__value_base_category(const wuffs_base__token* t) {
+  return ((int64_t)(t->repr)) >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT;
 }
 
 static inline uint64_t  //
 wuffs_base__token__value_minor(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_MINOR__MASK;
-}
-
-static inline uint64_t  //
-wuffs_base__token__value_base_category(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_BASE_CATEGORY__MASK;
+  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) & 0xFFFFFF;
 }
 
 static inline uint64_t  //
 wuffs_base__token__value_base_detail(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) &
-         WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__MASK;
+  return (t->repr >> WUFFS_BASE__TOKEN__VALUE_BASE_DETAIL__SHIFT) & 0x1FFFFF;
 }
 
 static inline bool  //
@@ -2069,33 +2081,37 @@
 
 static inline uint64_t  //
 wuffs_base__token__length(const wuffs_base__token* t) {
-  return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) &
-         WUFFS_BASE__TOKEN__LENGTH__MASK;
+  return (t->repr >> WUFFS_BASE__TOKEN__LENGTH__SHIFT) & 0xFFFF;
 }
 
 #ifdef __cplusplus
 
-inline uint64_t  //
+inline int64_t  //
 wuffs_base__token::value() const {
   return wuffs_base__token__value(this);
 }
 
-inline uint64_t  //
+inline int64_t  //
+wuffs_base__token::value_extension() const {
+  return wuffs_base__token__value_extension(this);
+}
+
+inline int64_t  //
 wuffs_base__token::value_major() const {
   return wuffs_base__token__value_major(this);
 }
 
+inline int64_t  //
+wuffs_base__token::value_base_category() const {
+  return wuffs_base__token__value_base_category(this);
+}
+
 inline uint64_t  //
 wuffs_base__token::value_minor() const {
   return wuffs_base__token__value_minor(this);
 }
 
 inline uint64_t  //
-wuffs_base__token::value_base_category() const {
-  return wuffs_base__token__value_base_category(this);
-}
-
-inline uint64_t  //
 wuffs_base__token::value_base_detail() const {
   return wuffs_base__token__value_base_detail(this);
 }

diff --git a/test/c/std/json.c b/test/c/std/json.c
index 93786d5..36054f6 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c

@@ -1376,7 +1376,7 @@
           size_t have = 0;
           while (tok.meta.ri < tok.meta.wi) {
             wuffs_base__token* t = &tok.data.ptr[tok.meta.ri++];
-            uint64_t vbc = wuffs_base__token__value_base_category(t);
+            int64_t vbc = wuffs_base__token__value_base_category(t);
             if (vbc == WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT) {
               break;
             } else if (vbc == WUFFS_BASE__TOKEN__VBC__STRING) {
@@ -1483,7 +1483,7 @@
       uint32_t have = 0;
       while (tok.meta.ri < tok.meta.wi) {
         wuffs_base__token* t = &tok.data.ptr[tok.meta.ri++];
-        uint64_t vbc = wuffs_base__token__value_base_category(t);
+        int64_t vbc = wuffs_base__token__value_base_category(t);
         uint64_t vbd = wuffs_base__token__value_base_detail(t);
         if (vbc == WUFFS_BASE__TOKEN__VBC__UNICODE_CODE_POINT) {
           have = vbd;
@@ -1553,7 +1553,7 @@
     uint64_t have_bytes = 0;
     while (tok.meta.ri < tok.meta.wi) {
       wuffs_base__token* t = &tok.data.ptr[tok.meta.ri++];
-      uint64_t vbc = wuffs_base__token__value_base_category(t);
+      int64_t vbc = wuffs_base__token__value_base_category(t);
       uint64_t vbd = wuffs_base__token__value_base_detail(t);
       uint64_t token_length = wuffs_base__token__length(t);
       if ((vbc == WUFFS_BASE__TOKEN__VBC__STRING) &&
@@ -2030,7 +2030,7 @@
     uint64_t src_index = 0;
     while (tok.meta.ri < tok.meta.wi) {
       wuffs_base__token* t = &tok.data.ptr[tok.meta.ri++];
-      uint64_t vbc = wuffs_base__token__value_base_category(t);
+      int64_t vbc = wuffs_base__token__value_base_category(t);
       uint64_t vbd = wuffs_base__token__value_base_detail(t);
       uint64_t token_length = wuffs_base__token__length(t);
commit	462f86675d89db5b6f25793c31e5e466943eb4ae	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Wed Apr 01 23:01:51 2020 +1100
committer	Nigel Tao <nigeltao@golang.org>	Thu Apr 02 11:20:45 2020 +1100
tree	a66fdc2c35e50c8865d6984661b67318d2a5cdbc
parent	7d684f94ae1fc6444202a7bfa7fb778a293a9c89 [diff]