blob: 9097b312cdcd53639f7d51e631a7bea096cd7ca1 [file] [log] [blame]
//========================================================================
//
// UTF.cc
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
//========================================================================
//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
// Copyright (C) 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2016, 2018-2020 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018, 2020 Nelson Benítez León <nbenitezl@gmail.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================
#include "goo/gmem.h"
#include "PDFDocEncoding.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "UTF.h"
#include "UnicodeMapFuncs.h"
#include <algorithm>
// Report whether ucs4 passes every range check below, i.e. whether it is
// acceptable as a decoded code point. Callers in this file substitute
// U+FFFD for values that fail.
bool UnicodeIsValid(Unicode ucs4)
{
    // Outside the code space (anything from 0x110000 upwards).
    if (ucs4 >= 0x110000) {
        return false;
    }
    // Surrogate range 0xD800..0xDFFF.
    if ((ucs4 & 0xfffff800) == 0xd800) {
        return false;
    }
    // The block 0xFDD0..0xFDEF.
    if (ucs4 >= 0xfdd0 && ucs4 <= 0xfdef) {
        return false;
    }
    // Values whose low 16 bits are 0xFFFE or 0xFFFF.
    return (ucs4 & 0xfffe) != 0xfffe;
}
int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4_out)
{
int i, n, len;
Unicode *u;
// count characters
len = 0;
for (i = 0; i < utf16Len; i++) {
if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
i++; /* surrogate pair */
}
len++;
}
if (ucs4_out == nullptr)
return len;
u = (Unicode *)gmallocn(len, sizeof(Unicode));
n = 0;
// convert string
for (i = 0; i < utf16Len; i++) {
if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
if (i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
/* next code is a low surrogate */
u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i + 1] & 0x3ff)) + 0x10000;
++i;
} else {
/* missing low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
u[n] = 0xfffd;
}
} else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
/* invalid low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
u[n] = 0xfffd;
} else {
u[n] = utf16[i];
}
if (!UnicodeIsValid(u[n])) {
u[n] = 0xfffd;
}
n++;
}
*ucs4_out = u;
return len;
}
int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
{
int i, len;
const char *s;
Unicode *u;
bool isUnicode, isUnicodeLE;
len = textStr->getLength();
s = textStr->c_str();
if (len == 0) {
*ucs4 = nullptr;
return 0;
}
if (textStr->hasUnicodeMarker()) {
isUnicode = true;
isUnicodeLE = false;
} else if (textStr->hasUnicodeMarkerLE()) {
isUnicode = false;
isUnicodeLE = true;
} else {
isUnicode = false;
isUnicodeLE = false;
}
if (isUnicode || isUnicodeLE) {
Unicode *utf16;
len = len / 2 - 1;
if (len > 0) {
utf16 = new Unicode[len];
for (i = 0; i < len; i++) {
if (isUnicode)
utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
else // UnicodeLE
utf16[i] = (s[3 + i * 2] & 0xff) << 8 | (s[2 + i * 2] & 0xff);
}
len = UTF16toUCS4(utf16, len, &u);
delete[] utf16;
} else {
u = nullptr;
}
} else {
u = (Unicode *)gmallocn(len, sizeof(Unicode));
for (i = 0; i < len; i++) {
u[i] = pdfDocEncoding[s[i] & 0xff];
}
}
*ucs4 = u;
return len;
}
// Report whether ucs4 is one of the code points listed in the table below.
bool UnicodeIsWhitespace(Unicode ucs4)
{
    // Must stay sorted in ascending order: it is searched with a binary search.
    static Unicode const whitespace[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
    Unicode const *const last = whitespace + sizeof(whitespace) / sizeof(whitespace[0]);
    return std::binary_search(whitespace, last, ucs4);
}
//
// decodeUtf8() and decodeUtf8Table are:
//
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// decodeUtf8() state: a complete code point has just been decoded. Also the
// initial state of the decoder.
static const uint32_t UTF8_ACCEPT = 0;
// decodeUtf8() state: the byte just read is not allowed at its position.
static const uint32_t UTF8_REJECT = 12;
// Upper bound the converters in this file apply to decoded code points.
static const uint32_t UCS4_MAX = 0x10FFFF;
// Written by the converters in this file in place of input that cannot be
// converted.
static const Unicode REPLACEMENT_CHAR = 0xFFFD;
// Lookup data for decodeUtf8(): 256 byte-class entries followed by a
// 9 x 12 state transition table (364 entries in total).
static const uint8_t decodeUtf8Table[] = {
    // Part 1: maps each byte value to a character class. This keeps the
    // transition table small and doubles as a bitmask selector for the
    // payload bits of a lead byte.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff

    // Part 2: transition table, indexed by (state + character class).
    // States are multiples of 12, one row of 12 classes per state.
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // state 0 (UTF8_ACCEPT)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 12 (UTF8_REJECT)
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 96
};
// One step of the table-driven UTF-8 decoder.
//
// Start with *state == 0 and feed the input one byte at a time. After each
// call the return value (which is also stored in *state) is
//   UTF8_ACCEPT  - a full code point is available in *codep,
//   UTF8_REJECT  - this byte is not allowed at this position,
//   other value  - more bytes are needed.
// To recover from an error, set *state back to 0.
inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte)
{
    const uint32_t octet = (unsigned char)byte;
    const uint32_t charClass = decodeUtf8Table[octet];

    if (*state == UTF8_ACCEPT) {
        // First byte of a sequence: the class selects how many low bits
        // of the byte are payload.
        *codep = (0xff >> charClass) & (octet);
    } else {
        // Continuation byte: append its six payload bits.
        *codep = (octet & 0x3fu) | (*codep << 6);
    }

    *state = decodeUtf8Table[256 + *state + charClass];
    return *state;
}
// Return how many UTF-16 code units are needed to hold the NUL-terminated
// UTF-8 string utf8 (terminator not included). Every rejected byte, and a
// sequence cut short by the end of the string, counts as one unit because
// it is accounted for as a single REPLACEMENT_CHAR.
int utf8CountUtf16CodeUnits(const char *utf8)
{
    uint32_t cp;
    uint32_t dfa = 0;
    int units = 0;

    for (const char *cur = utf8; *cur; ++cur) {
        decodeUtf8(&dfa, &cp, *cur);

        if (dfa == UTF8_REJECT) {
            ++units; // one REPLACEMENT_CHAR
            dfa = 0; // resynchronise on the next byte
            continue;
        }
        if (dfa != UTF8_ACCEPT) {
            continue; // in the middle of a multi-byte sequence
        }

        // Code points in 0x10000..UCS4_MAX need a surrogate pair; anything
        // else (including an out-of-range value, which becomes
        // REPLACEMENT_CHAR) takes a single unit.
        units += (cp >= 0x10000 && cp <= UCS4_MAX) ? 2 : 1;
    }

    // Input ended inside a multi-byte sequence.
    if (dfa != UTF8_ACCEPT && dfa != UTF8_REJECT) {
        ++units;
    }
    return units;
}
// Convert UTF-8 to UTF-16.
//
// utf8     - UTF-8 string to convert. If it is not NUL-terminated, set
//            maxUtf8 to the number of bytes to convert.
// utf16    - output buffer. The output is always NUL-terminated, provided
//            maxUtf16 is at least 1.
// maxUtf16 - capacity of the output buffer in code units, including the
//            slot for the terminating NUL.
// maxUtf8  - maximum number of UTF-8 bytes to read. Conversion stops when
//            this count is reached or a NUL byte is encountered.
//
// Each byte that is invalid at its position, and a sequence cut short by
// the end of the input, is written as one REPLACEMENT_CHAR; this matches
// what utf8CountUtf16CodeUnits() counts.
//
// Returns the number of UTF-16 code units written (excluding the NUL).
int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
{
    // Without room for even the terminator there is nothing safe to write.
    if (maxUtf16 <= 0) {
        return 0;
    }

    uint16_t *p = utf16;
    uint32_t codepoint;
    uint32_t state = 0;
    int nIn = 0;
    int nOut = 0;

    // The input bound is tested before *utf8 is read so that an
    // unterminated buffer of exactly maxUtf8 bytes is never read past.
    while (nIn < maxUtf8 && *utf8 && nOut < maxUtf16 - 1) {
        decodeUtf8(&state, &codepoint, *utf8);
        if (state == UTF8_ACCEPT) {
            if (codepoint < 0x10000) {
                *p++ = (uint16_t)codepoint;
                nOut++;
            } else if (codepoint <= UCS4_MAX) {
                // A surrogate pair needs two slots in front of the
                // terminator. Stop rather than emit a lone high surrogate.
                if (nOut + 2 > maxUtf16 - 1) {
                    break;
                }
                // 0xD7C0 + (cp >> 10) equals 0xD800 + ((cp - 0x10000) >> 10).
                *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
                *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
                nOut += 2;
            } else {
                *p++ = REPLACEMENT_CHAR;
                nOut++;
                state = 0;
            }
        } else if (state == UTF8_REJECT) {
            *p++ = REPLACEMENT_CHAR; // invalid byte for this position
            nOut++;
            // UTF8_REJECT is a trap state of the decoder: unless it is
            // reset, every following byte would be rejected as well.
            state = 0;
        }
        utf8++;
        nIn++;
    }

    // Replace a trailing, incomplete UTF-8 sequence with a replacement char.
    if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) {
        *p++ = REPLACEMENT_CHAR;
        nOut++;
    }

    // nOut never exceeds maxUtf16 - 1 here, so the terminator is in bounds.
    utf16[nOut] = 0;
    return nOut;
}
// Allocate a UTF-16 string and convert the NUL-terminated UTF-8 string utf8
// into it.
//
// utf8 - NUL-terminated UTF-8 input.
// len  - if not null, receives the number of UTF-16 code units in the
//        result (excluding the terminating NUL).
//
// Returns a NUL-terminated buffer obtained from gmallocn().
uint16_t *utf8ToUtf16(const char *utf8, int *len)
{
    const int n = utf8CountUtf16CodeUnits(utf8);
    uint16_t *utf16 = (uint16_t *)gmallocn(n + 1, sizeof(uint16_t));

    // Pass the real capacity so the conversion can never write outside the
    // allocation, even if it and the counting pass ever disagree.
    const int written = utf8ToUtf16(utf8, utf16, n + 1);
    if (len) {
        *len = written;
    }
    return utf16;
}
// decodeUtf16() result: a complete code point has been decoded. Also the
// initial state of the decoder.
static const uint32_t UTF16_ACCEPT = 0;
// decodeUtf16() result: the code unit is invalid for the current state.
// -1 converts to the largest uint32_t value, which no 16-bit code unit
// stored in the state can equal.
static const uint32_t UTF16_REJECT = -1;
// One step of an incremental UTF-16 decoder.
//
// Start with *state == 0 and feed the input one code unit at a time. The
// return value is
//   UTF16_ACCEPT - a full code point is available in *codePoint,
//   UTF16_REJECT - codeUnit is invalid for the current state,
//   other value  - a high surrogate was read and its low surrogate is
//                  still needed.
// Like decodeUtf8(), the result is also stored in *state, so callers may
// test either. After UTF16_REJECT the caller must set *state back to 0 to
// continue; the rejected code unit is consumed.
inline uint32_t decodeUtf16(uint32_t *state, uint32_t *codePoint, uint16_t codeUnit)
{
    const bool isHighSurrogate = codeUnit >= 0xd800 && codeUnit < 0xdc00;
    const bool isLowSurrogate = codeUnit >= 0xdc00 && codeUnit < 0xe000;

    // Stay rejected until the caller resets the state.
    if (*state == UTF16_REJECT) {
        return UTF16_REJECT;
    }

    if (*state == 0) {
        if (isHighSurrogate) {
            // Remember the high half and wait for the low half.
            *state = codeUnit;
            return *state;
        }
        if (isLowSurrogate) {
            // Low surrogate without a preceding high surrogate.
            *state = UTF16_REJECT;
            return UTF16_REJECT;
        }
        *codePoint = codeUnit;
        return UTF16_ACCEPT;
    }

    // *state holds a pending high surrogate.
    if (isLowSurrogate) {
        *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000;
        *state = 0;
        return UTF16_ACCEPT;
    }

    // High surrogate not followed by a low surrogate.
    *state = UTF16_REJECT;
    return UTF16_REJECT;
}
// Return how many UTF-8 bytes are needed to hold the NUL-terminated UTF-16
// string utf16 (terminator not included). Every invalid code unit, and a
// high surrogate left unpaired at the end of the string, is counted as the
// three bytes that REPLACEMENT_CHAR (U+FFFD) occupies in UTF-8.
int utf16CountUtf8Bytes(const uint16_t *utf16)
{
    uint32_t codepoint = 0;
    uint32_t state = 0;
    int count = 0;

    while (*utf16) {
        // Act on the decoder's return value: it is the authoritative
        // result of the step.
        const uint32_t result = decodeUtf16(&state, &codepoint, *utf16);
        if (result == UTF16_ACCEPT) {
            if (codepoint < 0x80) {
                count++;
            } else if (codepoint < 0x800) {
                count += 2;
            } else if (codepoint < 0x10000) {
                count += 3;
            } else if (codepoint <= UCS4_MAX) {
                count += 4;
            } else {
                count += 3; // replace with REPLACEMENT_CHAR
            }
        } else if (result == UTF16_REJECT) {
            count += 3; // replace with REPLACEMENT_CHAR
            state = 0;
        }
        utf16++;
    }

    // Input ended with a high surrogate still waiting for its low half:
    // it is replaced with REPLACEMENT_CHAR, which is three UTF-8 bytes.
    if (state != UTF16_ACCEPT && state != UTF16_REJECT) {
        count += 3;
    }
    return count;
}
// Convert UTF-16 to UTF-8.
//
// utf16    - UTF-16 string to convert. If it is not NUL-terminated, set
//            maxUtf16 to the number of code units to convert.
// utf8     - output buffer. The output is always NUL-terminated, provided
//            maxUtf8 is at least 1.
// maxUtf8  - capacity of the output buffer in bytes, including the slot
//            for the terminating NUL.
// maxUtf16 - maximum number of UTF-16 code units to read. Conversion stops
//            when this count is reached or a NUL is encountered.
//
// Invalid code units, and a high surrogate left unpaired at the end of the
// input, are written as REPLACEMENT_CHAR; this matches what
// utf16CountUtf8Bytes() counts.
//
// Returns the number of UTF-8 bytes written (excluding the NUL).
int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
{
    // Without room for even the terminator there is nothing safe to write.
    if (maxUtf8 <= 0) {
        return 0;
    }

    uint32_t codepoint = 0;
    uint32_t state = 0;
    int nIn = 0;
    int nOut = 0;
    char *p = utf8;

    // The input bound is tested before *utf16 is read so that an
    // unterminated buffer of exactly maxUtf16 units is never read past.
    while (nIn < maxUtf16 && *utf16 && nOut < maxUtf8 - 1) {
        // Act on the decoder's return value: it is the authoritative
        // result of the step.
        const uint32_t result = decodeUtf16(&state, &codepoint, *utf16);
        if (result == UTF16_ACCEPT || result == UTF16_REJECT) {
            if (result == UTF16_REJECT || codepoint > UCS4_MAX) {
                codepoint = REPLACEMENT_CHAR;
                state = 0;
            }
            // Keep one byte back for the terminator so that a complete
            // character is never half-overwritten by it.
            const int bufSize = maxUtf8 - 1 - nOut;
            const int count = mapUTF8(codepoint, p, bufSize);
            if (count == 0) {
                // NOTE(review): assumes mapUTF8() returns 0 when the
                // encoded character does not fit in bufSize - confirm
                // against UnicodeMapFuncs.h. Stop here instead of dropping
                // this character and carrying on with later ones.
                break;
            }
            p += count;
            nOut += count;
        }
        utf16++;
        nIn++;
    }

    // Replace a trailing high surrogate that never got its low half.
    if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) {
        const int bufSize = maxUtf8 - 1 - nOut;
        nOut += mapUTF8(REPLACEMENT_CHAR, p, bufSize);
    }

    // nOut never exceeds maxUtf8 - 1 here, so the terminator is in bounds.
    utf8[nOut] = 0;
    return nOut;
}
// Allocate a UTF-8 string and convert the NUL-terminated UTF-16 string
// utf16 into it.
//
// utf16 - NUL-terminated UTF-16 input.
// len   - if not null, receives the number of bytes in the result
//         (excluding the terminating NUL).
//
// Returns a NUL-terminated buffer obtained from gmalloc().
char *utf16ToUtf8(const uint16_t *utf16, int *len)
{
    const int n = utf16CountUtf8Bytes(utf16);
    char *utf8 = (char *)gmalloc(n + 1);

    // Pass the real capacity so the conversion can never write outside the
    // allocation, even if it and the counting pass ever disagree.
    const int written = utf16ToUtf8(utf16, utf8, n + 1);
    if (len) {
        *len = written;
    }
    return utf8;
}
void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices)
{
const UnicodeMap *uMap = globalParams->getUnicodeMap("ASCII7");
int *idx = nullptr;
if (!len) {
*ucs4_out = nullptr;
*out_len = 0;
return;
}
if (indices) {
if (!in_idx)
indices = nullptr;
else
idx = (int *)gmallocn(len * 8 + 1, sizeof(int));
}
GooString gstr;
char buf[8]; // 8 is enough for mapping an unicode char to a string
int i, n, k;
for (i = k = 0; i < len; ++i) {
n = uMap->mapUnicode(in[i], buf, sizeof(buf));
if (!n) {
// the Unicode char could not be converted to ascii7 counterpart
// so just fill with a non-printable ascii char
buf[0] = 31;
n = 1;
}
gstr.append(buf, n);
if (indices) {
for (; n > 0; n--)
idx[k++] = in_idx[i];
}
}
*out_len = TextStringToUCS4(&gstr, ucs4_out);
if (indices) {
idx[k] = in_idx[len];
*indices = idx;
}
}