blob: 7696e2686e4cd3687ec8c5d4b5b41d762adfea8e [file] [log] [blame]
<!-- HTML header for doxygen 1.8.7-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.16"/>
<title>RapidJSON: Encoding</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
$(document).ready(initResizable);
/* @license-end */</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
$(document).ready(function() { init_search(); });
/* @license-end */
</script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="doxygenextra.css" rel="stylesheet" type="text/css"/>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="topbanner"><a href="https://github.com/Tencent/rapidjson" title="RapidJSON GitHub"><i class="githublogo"></i></a></div>
<div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.16 -->
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
var searchBox = new SearchBox("searchBox", "search",false,'Search');
/* @license-end */
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
$(document).ready(function(){initNavTree('md_doc_encoding.html','');});
/* @license-end */
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="PageDoc"><div class="header">
<div class="headertitle">
<div class="title">Encoding </div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><h3>Table of Contents</h3>
<ul><li class="level1"><a href="#Unicode">Unicode</a><ul><li class="level2"><a href="#UTF">Unicode Transformation Format</a></li>
<li class="level2"><a href="#CharacterType">Character Type</a></li>
<li class="level2"><a href="#AutoUTF">AutoUTF</a></li>
<li class="level2"><a href="#ASCII">ASCII</a></li>
</ul>
</li>
<li class="level1"><a href="#ValidationTranscoding">Validation &amp; Transcoding</a><ul><li class="level2"><a href="#Transcoder">Transcoder</a></li>
</ul>
</li>
</ul>
</div>
<div class="textblock"><p>According to <a href="http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf">ECMA-404</a>,</p>
<blockquote class="doxtable">
<p>(in Introduction) JSON text is a sequence of Unicode code points. </p>
</blockquote>
<p>The earlier <a href="http://www.ietf.org/rfc/rfc4627.txt">RFC4627</a> stated that,</p>
<blockquote class="doxtable">
<p>(in §3) JSON text SHALL be encoded in Unicode. The default encoding is UTF-8. </p>
</blockquote>
<blockquote class="doxtable">
<p>(in §6) JSON may be represented using UTF-8, UTF-16, or UTF-32. When JSON is written in UTF-8, JSON is 8bit compatible. When JSON is written in UTF-16 or UTF-32, the binary content-transfer-encoding must be used. </p>
</blockquote>
<p>RapidJSON supports various encodings. It can also validate the encoding of JSON and transcode JSON among encodings. All these features are implemented internally, without the need for external libraries (e.g. <a href="http://site.icu-project.org/">ICU</a>).</p>
<h1><a class="anchor" id="Unicode"></a>
Unicode</h1>
<p>From <a href="http://www.unicode.org/standard/WhatIsUnicode.html">Unicode's official website</a>: </p><blockquote class="doxtable">
<p>Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language. </p>
</blockquote>
<p>Those unique numbers are called code points, which are in the range <code>0x0</code> to <code>0x10FFFF</code>.</p>
<h2><a class="anchor" id="UTF"></a>
Unicode Transformation Format</h2>
<p>There are various encodings for storing Unicode code points. These are called Unicode Transformation Formats (UTFs). RapidJSON supports the most commonly used UTFs, including</p>
<ul>
<li>UTF-8: 8-bit variable-width encoding. It maps a code point to 1–4 bytes.</li>
<li>UTF-16: 16-bit variable-width encoding. It maps a code point to 1–2 16-bit code units (i.e., 2–4 bytes).</li>
<li>UTF-32: 32-bit fixed-width encoding. It directly maps a code point to a single 32-bit code unit (i.e. 4 bytes).</li>
</ul>
<p>For UTF-16 and UTF-32, the byte order (endianness) does matter. Within computer memory, they are often stored in the computer's endianness. However, when they are stored in a file or transferred over a network, we need to state the byte order of the byte sequence, either little-endian (LE) or big-endian (BE).</p>
<p>RapidJSON provides these encodings via the structs in <code><a class="el" href="encodings_8h_source.html">rapidjson/encodings.h</a></code>:</p>
<div class="fragment"><div class="line"><span class="keyword">namespace </span><a class="code" href="namespacerapidjson.html">rapidjson</a> {</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">char</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF8;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">wchar_t</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF16;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">wchar_t</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF16LE;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">wchar_t</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF16BE;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">unsigned</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF32;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">unsigned</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF32LE;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">template</span>&lt;<span class="keyword">typename</span> CharType = <span class="keywordtype">unsigned</span>&gt;</div>
<div class="line"><span class="keyword">struct </span>UTF32BE;</div>
<div class="line"> </div>
<div class="line">} <span class="comment">// namespace rapidjson</span></div>
</div><!-- fragment --><p>For processing text in memory, we normally use <code>UTF8</code>, <code>UTF16</code> or <code>UTF32</code>. For processing text via I/O, we may use <code>UTF8</code>, <code>UTF16LE</code>, <code>UTF16BE</code>, <code>UTF32LE</code> or <code>UTF32BE</code>.</p>
<p>When using the DOM-style API, the <code>Encoding</code> template parameter in <code>GenericValue&lt;Encoding&gt;</code> and <code>GenericDocument&lt;Encoding&gt;</code> indicates the encoding to be used to represent JSON string in memory. So normally we will use <code>UTF8</code>, <code>UTF16</code> or <code>UTF32</code> for this template parameter. The choice depends on operating systems and other libraries that the application is using. For example, Windows API represents Unicode characters in UTF-16, while most Linux distributions and applications prefer UTF-8.</p>
<p>Example of UTF-16 DOM declaration:</p>
<div class="fragment"><div class="line"><span class="keyword">typedef</span> GenericDocument&lt;UTF16&lt;&gt; &gt; WDocument;</div>
<div class="line"><span class="keyword">typedef</span> GenericValue&lt;UTF16&lt;&gt; &gt; WValue;</div>
</div><!-- fragment --><p>For a detailed example, please check the example in the <a class="el" href="md_doc_stream.html">DOM's Encoding</a> section.</p>
<h2><a class="anchor" id="CharacterType"></a>
Character Type</h2>
<p>As shown in the declaration, each encoding has a <code>CharType</code> template parameter. Actually, it may be a little bit confusing, but each <code>CharType</code> stores a code unit, not a character (code point). As mentioned in the previous section, a code point may be encoded to 1–4 code units for UTF-8.</p>
<p>For <code>UTF16(LE|BE)</code>, <code>UTF32(LE|BE)</code>, the <code>CharType</code> must be an integer type of at least 2 and 4 bytes respectively.</p>
<p>Note that C++11 introduces <code>char16_t</code> and <code>char32_t</code>, which can be used for <code>UTF16</code> and <code>UTF32</code> respectively.</p>
<h2><a class="anchor" id="AutoUTF"></a>
AutoUTF</h2>
<p>The previous encodings are statically bound at compile time. In other words, the user must know exactly which encodings will be used in memory or streams. However, sometimes we may need to read/write files of different encodings. In that case, the encoding needs to be decided at runtime.</p>
<p><code>AutoUTF</code> is an encoding designed for this purpose. It chooses which encoding to use according to the input or output stream. Currently, it should be used with <code>EncodedInputStream</code> and <code>EncodedOutputStream</code>.</p>
<h2><a class="anchor" id="ASCII"></a>
ASCII</h2>
<p>Although the JSON standards do not mention <a href="http://en.wikipedia.org/wiki/ASCII">ASCII</a>, sometimes we would like to write 7-bit ASCII JSON for applications that cannot handle UTF-8. Since any JSON can represent Unicode characters with the escape sequence <code>\uXXXX</code>, JSON can always be encoded in ASCII.</p>
<p>Here is an example for writing a UTF-8 DOM into ASCII:</p>
<div class="fragment"><div class="line"><span class="keyword">using namespace </span><a class="code" href="namespacerapidjson.html">rapidjson</a>;</div>
<div class="line"><a class="code" href="classrapidjson_1_1_generic_document.html">Document</a> d; <span class="comment">// UTF8&lt;&gt;</span></div>
<div class="line"><span class="comment">// ...</span></div>
<div class="line"><a class="code" href="classrapidjson_1_1_generic_string_buffer.html">StringBuffer</a> buffer;</div>
<div class="line"><a class="code" href="classrapidjson_1_1_writer.html">Writer&lt;StringBuffer, Document::EncodingType, ASCII&lt;&gt;</a> &gt; writer(buffer);</div>
<div class="line">d.Accept(writer);</div>
<div class="line">std::cout &lt;&lt; buffer.GetString();</div>
</div><!-- fragment --><p>ASCII can be used in an input stream. If the input stream contains bytes with values above 127, it will cause a <code>kParseErrorStringInvalidEncoding</code> error.</p>
<p>ASCII <em>cannot</em> be used in memory (encoding of <code>Document</code> or target encoding of <code>Reader</code>), as it cannot represent Unicode code points.</p>
<h1><a class="anchor" id="ValidationTranscoding"></a>
Validation &amp; Transcoding</h1>
<p>When RapidJSON parses a JSON, it can validate whether the input JSON is a valid sequence of the specified encoding. This option can be turned on by adding <code>kParseValidateEncodingFlag</code> to the <code>parseFlags</code> template parameter.</p>
<p>If the input encoding and output encoding are different, <code>Reader</code> and <code>Writer</code> will automatically transcode (convert) the text. In this case, <code>kParseValidateEncodingFlag</code> is not necessary, as it must decode the input sequence. And if the sequence cannot be decoded, it must be invalid.</p>
<h2><a class="anchor" id="Transcoder"></a>
Transcoder</h2>
<p>Although the encoding functions in RapidJSON are designed for JSON parsing/generation, users may also use them for transcoding non-JSON strings.</p>
<p>Here is an example for transcoding a string from UTF-8 to UTF-16:</p>
<div class="fragment"><div class="line"><span class="preprocessor">#include &quot;rapidjson/encodings.h&quot;</span></div>
<div class="line"> </div>
<div class="line"><span class="keyword">using namespace </span><a class="code" href="namespacerapidjson.html">rapidjson</a>;</div>
<div class="line"> </div>
<div class="line"><span class="keyword">const</span> <span class="keywordtype">char</span>* s = <span class="stringliteral">&quot;...&quot;</span>; <span class="comment">// UTF-8 string</span></div>
<div class="line"><a class="code" href="structrapidjson_1_1_generic_string_stream.html">StringStream</a> source(s);</div>
<div class="line"><a class="code" href="classrapidjson_1_1_generic_string_buffer.html">GenericStringBuffer&lt;UTF16&lt;&gt;</a> &gt; target;</div>
<div class="line"> </div>
<div class="line"><span class="keywordtype">bool</span> hasError = <span class="keyword">false</span>;</div>
<div class="line"><span class="keywordflow">while</span> (source.Peek() != <span class="charliteral">&#39;\0&#39;</span>)</div>
<div class="line"> <span class="keywordflow">if</span> (!<a class="code" href="structrapidjson_1_1_transcoder.html">Transcoder</a>&lt;<a class="code" href="structrapidjson_1_1_u_t_f8.html">UTF8&lt;&gt;</a>, <a class="code" href="structrapidjson_1_1_u_t_f16.html">UTF16&lt;&gt;</a> &gt;::Transcode(source, target)) {</div>
<div class="line"> hasError = <span class="keyword">true</span>;</div>
<div class="line"> <span class="keywordflow">break</span>;</div>
<div class="line"> }</div>
<div class="line"> </div>
<div class="line"><span class="keywordflow">if</span> (!hasError) {</div>
<div class="line"> <span class="keyword">const</span> <span class="keywordtype">wchar_t</span>* t = target.GetString();</div>
<div class="line"> <span class="comment">// ...</span></div>
<div class="line">}</div>
</div><!-- fragment --><p>You may also use <code>AutoUTF</code> and the associated streams for setting source/target encoding at runtime. </p>
</div></div><!-- contents -->
</div><!-- PageDoc -->
</div><!-- doc-content -->
<div class="ttc" id="astructrapidjson_1_1_u_t_f16_html"><div class="ttname"><a href="structrapidjson_1_1_u_t_f16.html">rapidjson::UTF16</a></div><div class="ttdoc">UTF-16 encoding.</div><div class="ttdef"><b>Definition:</b> encodings.h:269</div></div>
<div class="ttc" id="aclassrapidjson_1_1_generic_string_buffer_html"><div class="ttname"><a href="classrapidjson_1_1_generic_string_buffer.html">rapidjson::GenericStringBuffer</a></div><div class="ttdoc">Represents an in-memory output stream.</div><div class="ttdef"><b>Definition:</b> fwd.h:59</div></div>
<div class="ttc" id="astructrapidjson_1_1_generic_string_stream_html"><div class="ttname"><a href="structrapidjson_1_1_generic_string_stream.html">rapidjson::GenericStringStream</a></div><div class="ttdoc">Read-only string stream.</div><div class="ttdef"><b>Definition:</b> fwd.h:47</div></div>
<div class="ttc" id="astructrapidjson_1_1_transcoder_html"><div class="ttname"><a href="structrapidjson_1_1_transcoder.html">rapidjson::Transcoder</a></div><div class="ttdoc">Encoding conversion.</div><div class="ttdef"><b>Definition:</b> encodings.h:658</div></div>
<div class="ttc" id="aclassrapidjson_1_1_generic_document_html"><div class="ttname"><a href="classrapidjson_1_1_generic_document.html">rapidjson::GenericDocument</a></div><div class="ttdoc">A document for parsing JSON text as DOM.</div><div class="ttdef"><b>Definition:</b> document.h:69</div></div>
<div class="ttc" id="aclassrapidjson_1_1_writer_html"><div class="ttname"><a href="classrapidjson_1_1_writer.html">rapidjson::Writer</a></div><div class="ttdoc">JSON writer.</div><div class="ttdef"><b>Definition:</b> fwd.h:95</div></div>
<div class="ttc" id="astructrapidjson_1_1_u_t_f8_html"><div class="ttname"><a href="structrapidjson_1_1_u_t_f8.html">rapidjson::UTF8</a></div><div class="ttdoc">UTF-8 encoding.</div><div class="ttdef"><b>Definition:</b> encodings.h:96</div></div>
<div class="ttc" id="anamespacerapidjson_html"><div class="ttname"><a href="namespacerapidjson.html">rapidjson</a></div><div class="ttdoc">main RapidJSON namespace</div><div class="ttdef"><b>Definition:</b> rapidjson.h:409</div></div>
<!-- HTML footer for doxygen 1.8.7-->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
</ul>
</div>
</body>
</html>