blob: f7b02d14795d30029ddbf322e91fd5cff984dcd8 [file] [log] [blame]
//========================================================================
//
// UTF.h
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
//========================================================================
//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
// Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2016 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================
#include "goo/gmem.h"
#include "PDFDocEncoding.h"
#include "UTF.h"
#include <algorithm>
bool UnicodeIsValid(Unicode ucs4)
{
return (ucs4 < 0x110000) &&
((ucs4 & 0xfffff800) != 0xd800) &&
(ucs4 < 0xfdd0 || ucs4 > 0xfdef) &&
((ucs4 & 0xfffe) != 0xfffe);
}
int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
{
int i, n, len;
Unicode *u;
// count characters
len = 0;
for (i = 0; i < utf16Len; i++) {
if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
i++; /* surrogate pair */
}
len++;
}
if (ucs4 == NULL)
return len;
u = (Unicode*)gmallocn(len, sizeof(Unicode));
n = 0;
// convert string
for (i = 0; i < utf16Len; i++) {
if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
/* next code is a low surrogate */
u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
++i;
} else {
/* missing low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
u[n] = 0xfffd;
}
} else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
/* invalid low surrogate
replace it with REPLACEMENT CHARACTER (U+FFFD) */
u[n] = 0xfffd;
} else {
u[n] = utf16[i];
}
if (!UnicodeIsValid(u[n])) {
u[n] = 0xfffd;
}
n++;
}
*ucs4 = u;
return len;
}
int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
{
int i, len;
const char *s;
Unicode *u;
len = textStr->getLength();
s = textStr->getCString();
if (len == 0) {
*ucs4 = 0;
return 0;
}
if (textStr->hasUnicodeMarker()) {
Unicode *utf16;
len = len/2 - 1;
if (len > 0) {
utf16 = new Unicode[len];
for (i = 0 ; i < len; i++) {
utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
}
len = UTF16toUCS4(utf16, len, &u);
delete[] utf16;
} else {
u = NULL;
}
} else {
u = (Unicode*)gmallocn(len, sizeof(Unicode));
for (i = 0 ; i < len; i++) {
u[i] = pdfDocEncoding[s[i] & 0xff];
}
}
*ucs4 = u;
return len;
}
bool UnicodeIsWhitespace(Unicode ucs4)
{
static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F,
0x3000 };
Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]);
Unicode const *i = std::lower_bound(spaces, end, ucs4);
return (i != end && *i == ucs4);
}