blob: 3cfa4020b50c487eaaa4a08b7375d3114dc6ce37 [file] [log] [blame] [edit]
//========================================================================
//
// CharCodeToUnicode.cc
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
//========================================================================
//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2006, 2008-2010 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2007 Julien Rebetez <julienr@svn.gnome.org>
// Copyright (C) 2007 Koji Otani <sho@bbr.jp>
// Copyright (C) 2008 Michael Vrable <mvrable@cs.ucsd.edu>
// Copyright (C) 2008 Vasile Gaburici <gaburici@cs.umd.edu>
// Copyright (C) 2010 William Bader <williambader@hotmail.com>
// Copyright (C) 2010 Jakub Wilk <ubanus@users.sf.net>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================
#include <config.h>
#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif
#include <stdio.h>
#include <string.h>
#include "goo/gmem.h"
#include "goo/gfile.h"
#include "goo/GooLikely.h"
#include "goo/GooString.h"
#include "Error.h"
#include "GlobalParams.h"
#include "PSTokenizer.h"
#include "CharCodeToUnicode.h"
//------------------------------------------------------------------------
// One multi-value mapping: a char code that maps to a sequence of
// Unicode values rather than to a single one.  Entries of this type are
// kept in CharCodeToUnicode::sMap.
struct CharCodeToUnicodeString {
// char code this entry is for
CharCode c;
// array of <len> Unicode values; allocated with gmallocn and released
// with gfree by the code in this file
Unicode *u;
// number of elements in u
int len;
};
//------------------------------------------------------------------------
// Character source (passed to PSTokenizer) that reads from a
// NUL-terminated string.  <data> points to a (char *) cursor; each call
// returns the next byte and advances the cursor.  Once the terminating
// NUL is reached it returns EOF and leaves the cursor where it is.
static int getCharFromString(void *data) {
  char **cursor = (char **)data;
  char *p = *cursor;

  if (!*p) {
    return EOF;
  }
  // Convert through unsigned char, as fgetc() does.  Where plain char is
  // signed, a byte >= 0x80 would otherwise be returned as a negative
  // value, and 0xff would be indistinguishable from EOF.
  const int c = (unsigned char)*p++;
  *cursor = p;
  return c;
}
// Character source (passed to PSTokenizer) that reads from a stdio
// stream.  <data> is the FILE * to read from; the result is whatever
// fgetc() returns for it.
static int getCharFromFile(void *data) {
  FILE *stream = static_cast<FILE *>(data);
  return fgetc(stream);
}
//------------------------------------------------------------------------
// Builds a CharCodeToUnicode from a cidToUnicode file: a text file with
// one hexadecimal value per line, where the n-th line (counting from
// zero) supplies the mapping for code n.  A line that does not parse is
// reported and mapped to 0.  Returns NULL if the file cannot be opened.
// The new object is tagged with a copy of <collection>.
CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GooString *fileName,
                                                        GooString *collection) {
  FILE *in = fopen(fileName->getCString(), "r");
  if (!in) {
    error(-1, "Couldn't open cidToUnicode file '%s'",
          fileName->getCString());
    return NULL;
  }

  CharCode capacity = 32768;
  CharCode count = 0;
  Unicode *table = (Unicode *)gmallocn(capacity, sizeof(Unicode));
  char lineBuf[64];

  while (getLine(lineBuf, sizeof(lineBuf), in)) {
    // double the table once it is full
    if (count == capacity) {
      capacity *= 2;
      table = (Unicode *)greallocn(table, capacity, sizeof(Unicode));
    }

    Unicode value;
    if (sscanf(lineBuf, "%x", &value) != 1) {
      error(-1, "Bad line (%d) in cidToUnicode file '%s'",
            (int)(count + 1), fileName->getCString());
      value = 0;
    }
    table[count] = value;
    ++count;
  }
  fclose(in);

  // the constructor is asked to copy the table (copyMap == gTrue), so
  // the scratch buffer is released here
  CharCodeToUnicode *result =
      new CharCodeToUnicode(collection->copy(), table, count, gTrue,
                            NULL, 0, 0);
  gfree(table);
  return result;
}
// Builds a CharCodeToUnicode from a unicodeToUnicode file.  Each line
// holds hexadecimal numbers separated by blanks/tabs: the first is the
// source code, the following one or more are the Unicode values it maps
// to.  A source code with a single value goes into the direct table; one
// with several values becomes a multi-value (sMap) entry.  Malformed
// lines are reported and skipped.  Returns NULL if the file cannot be
// opened.  The new object is tagged with a copy of <fileName>.
CharCodeToUnicode *CharCodeToUnicode::parseUnicodeToUnicode(
    GooString *fileName) {
  FILE *f;
  Unicode *mapA;
  CharCodeToUnicodeString *sMapA;
  CharCode size, newSize, len, sMapSizeA, sMapLenA;
  const CharCode maxSize = ~(CharCode)0;
  char buf[256];
  char *tok;
  Unicode u0;
  int uBufSize = 8;
  Unicode *uBuf = (Unicode *)gmallocn(uBufSize, sizeof(Unicode));
  CharCodeToUnicode *ctu;
  int line, n, i;
  char *tokptr;

  if (!(f = fopen(fileName->getCString(), "r"))) {
    gfree(uBuf);
    error(-1, "Couldn't open unicodeToUnicode file '%s'",
          fileName->getCString());
    return NULL;
  }

  size = 4096;
  mapA = (Unicode *)gmallocn(size, sizeof(Unicode));
  memset(mapA, 0, size * sizeof(Unicode));
  len = 0;
  sMapA = NULL;
  sMapSizeA = sMapLenA = 0;

  line = 0;
  while (getLine(buf, sizeof(buf), f)) {
    ++line;

    // first token: the source code
    if (!(tok = strtok_r(buf, " \t\r\n", &tokptr)) ||
        sscanf(tok, "%x", &u0) != 1) {
      error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
            line, fileName->getCString());
      continue;
    }

    // remaining tokens: the Unicode values, collected in uBuf[0..n-1]
    n = 0;
    while ((tok = strtok_r(NULL, " \t\r\n", &tokptr))) {
      if (n >= uBufSize) {
        uBufSize += 8;
        uBuf = (Unicode *)greallocn(uBuf, uBufSize, sizeof(Unicode));
      }
      if (sscanf(tok, "%x", &uBuf[n]) != 1) {
        // the values read so far on this line are still used below
        error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
              line, fileName->getCString());
        break;
      }
      ++n;
    }
    if (n < 1) {
      error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
            line, fileName->getCString());
      continue;
    }

    // grow the direct table so that it covers u0
    if (u0 >= size) {
      // Double until u0 fits, but stop before the size would wrap around
      // to 0: a wrapped size can never exceed u0, so the doubling loop
      // would spin forever.
      newSize = size;
      while (u0 >= newSize && newSize <= maxSize / 2) {
        newSize *= 2;
      }
      if (u0 >= newSize) {
        error(-1, "Bad line (%d) in unicodeToUnicode file '%s'",
              line, fileName->getCString());
        continue;
      }
      mapA = (Unicode *)greallocn(mapA, newSize, sizeof(Unicode));
      memset(mapA + size, 0, (newSize - size) * sizeof(Unicode));
      size = newSize;
    }

    if (n == 1) {
      mapA[u0] = uBuf[0];
    } else {
      // several values: direct slot is cleared, sequence goes to sMapA
      mapA[u0] = 0;
      if (sMapLenA == sMapSizeA) {
        sMapSizeA += 16;
        sMapA = (CharCodeToUnicodeString *)
                  greallocn(sMapA, sMapSizeA, sizeof(CharCodeToUnicodeString));
      }
      sMapA[sMapLenA].c = u0;
      sMapA[sMapLenA].u = (Unicode *)gmallocn(n, sizeof(Unicode));
      for (i = 0; i < n; ++i) {
        sMapA[sMapLenA].u[i] = uBuf[i];
      }
      sMapA[sMapLenA].len = n;
      ++sMapLenA;
    }

    // len tracks one past the highest source code seen
    if (u0 >= len) {
      len = u0 + 1;
    }
  }
  fclose(f);

  // mapA is copied by the constructor (copyMap == gTrue) and released
  // here; sMapA is handed over to the new object as is
  ctu = new CharCodeToUnicode(fileName->copy(), mapA, len, gTrue,
                              sMapA, sMapLenA, sMapSizeA);
  gfree(mapA);
  gfree(uBuf);
  return ctu;
}
// Creates an untagged CharCodeToUnicode from the first 256 entries of
// <toUnicode>.  The table is copied (copyMap == gTrue); the new object
// has no multi-value mappings.
CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) {
  CharCodeToUnicode *ctu =
      new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0, 0);
  return ctu;
}
// Creates an untagged CharCodeToUnicode and fills it by parsing the
// ToUnicode CMap text held in <buf>.  <nBits> is forwarded to
// parseCMap1().
CharCodeToUnicode *CharCodeToUnicode::parseCMap(GooString *buf, int nBits) {
  CharCodeToUnicode *result = new CharCodeToUnicode(NULL);

  // getCharFromString advances this cursor through the buffer
  char *cursor = buf->getCString();
  result->parseCMap1(&getCharFromString, &cursor, nBits);
  return result;
}
// Creates an untagged CharCodeToUnicode and fills it from the ToUnicode
// CMap file that globalParams->findToUnicodeFile() returns for
// <fileName>.  If no file is found an error is reported and the object
// is returned in its freshly constructed state.
CharCodeToUnicode *CharCodeToUnicode::parseCMapFromFile(GooString *fileName,
                                                        int nBits) {
  CharCodeToUnicode *result = new CharCodeToUnicode(NULL);

  FILE *in = globalParams->findToUnicodeFile(fileName);
  if (!in) {
    error(-1, "Couldn't find ToUnicode CMap file for '%s'",
          fileName->getCString());
    return result;
  }

  result->parseCMap1(&getCharFromFile, in, nBits);
  fclose(in);
  return result;
}
// Parses the ToUnicode CMap text held in <buf> and adds its mappings to
// this (already existing) object.
void CharCodeToUnicode::mergeCMap(GooString *buf, int nBits) {
  // getCharFromString advances this cursor through the buffer
  char *cursor = buf->getCString();
  parseCMap1(&getCharFromString, &cursor, nBits);
}
// Tokenizes ToUnicode CMap text pulled through <getCharFunc>/<data> and
// adds every mapping found to this object.  Three operators are handled:
//   /Name usecmap             - parse the file that
//                               globalParams->findToUnicodeFile() returns
//                               for Name (recursive call)
//   beginbfchar ... endbfchar   - <code> <unicode> pairs
//   beginbfrange ... endbfrange - <lo> <hi> followed by either a start
//                               value <unicode> or an array [ <u> ... ]
// A source code token must be '<', nBits/4 hex digits and '>'; a form
// with two extra leading "00" digits is accepted as well.  All other
// tokens are skipped.
void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
                                   int nBits) {
  PSTokenizer *pst;
  char tok1[256], tok2[256], tok3[256];
  int nDigits, n1, n2, n3;
  CharCode i;
  CharCode code1, code2;
  GooString *name;
  FILE *f;

  nDigits = nBits / 4;
  pst = new PSTokenizer(getCharFunc, data);

  // Outside of the bf blocks, tok1 holds the token that preceded tok2, so
  // that the "/Name" operand is available when "usecmap" is reached.
  pst->getToken(tok1, sizeof(tok1), &n1);
  while (pst->getToken(tok2, sizeof(tok2), &n2)) {
    if (!strcmp(tok2, "usecmap")) {
      if (tok1[0] == '/') {
        name = new GooString(tok1 + 1);
        if ((f = globalParams->findToUnicodeFile(name))) {
          parseCMap1(&getCharFromFile, f, nBits);
          fclose(f);
        } else {
          error(-1, "Couldn't find ToUnicode CMap file for '%s'",
                name->getCString());
        }
        delete name;
      }
      pst->getToken(tok1, sizeof(tok1), &n1);

    } else if (!strcmp(tok2, "beginbfchar")) {
      while (pst->getToken(tok1, sizeof(tok1), &n1)) {
        if (!strcmp(tok1, "endbfchar")) {
          break;
        }
        if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
            !strcmp(tok2, "endbfchar")) {
          error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
          break;
        }
        // tok1 must be a source code (plain or "00"-prefixed form), tok2
        // any <...> token
        if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
              tok2[0] == '<' && tok2[n2 - 1] == '>')) {
          if (!(n1 == 4 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
                tok1[1] == '0' && tok1[2] == '0' &&
                tok2[0] == '<' && tok2[n2 - 1] == '>')) {
            error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
            continue;
          }
        }
        // cut off the closing '>' so the hex digits are NUL-terminated
        tok1[n1 - 1] = tok2[n2 - 1] = '\0';
        if (sscanf(tok1 + 1, "%x", &code1) != 1) {
          error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
          continue;
        }
        addMapping(code1, tok2 + 1, n2 - 2, 0);
      }
      pst->getToken(tok1, sizeof(tok1), &n1);

    } else if (!strcmp(tok2, "beginbfrange")) {
      while (pst->getToken(tok1, sizeof(tok1), &n1)) {
        if (!strcmp(tok1, "endbfrange")) {
          break;
        }
        if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
            !strcmp(tok2, "endbfrange") ||
            !pst->getToken(tok3, sizeof(tok3), &n3) ||
            !strcmp(tok3, "endbfrange")) {
          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
          break;
        }
        // Both range ends must be source codes, each in the plain or the
        // "00"-prefixed form.  The prefix test for the second end looks
        // at tok2 (it used to look at tok1 again by mistake).
        if (!(((n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>') ||
               (n1 == 4 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
                tok1[1] == '0' && tok1[2] == '0')) &&
              ((n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>') ||
               (n2 == 4 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>' &&
                tok2[1] == '0' && tok2[2] == '0')))) {
          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
          continue;
        }
        tok1[n1 - 1] = tok2[n2 - 1] = '\0';
        if (sscanf(tok1 + 1, "%x", &code1) != 1 ||
            sscanf(tok2 + 1, "%x", &code2) != 1) {
          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
          continue;
        }
        if (!strcmp(tok3, "[")) {
          // array form: one <...> destination per code in the range
          i = 0;
          while (pst->getToken(tok1, sizeof(tok1), &n1) &&
                 code1 + i <= code2) {
            if (!strcmp(tok1, "]")) {
              break;
            }
            if (tok1[0] == '<' && tok1[n1 - 1] == '>') {
              tok1[n1 - 1] = '\0';
              addMapping(code1 + i, tok1 + 1, n1 - 2, 0);
            } else {
              error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
            }
            ++i;
          }
        } else if (tok3[0] == '<' && tok3[n3 - 1] == '>') {
          // start-value form: the i-th code of the range maps to the
          // start value with i added (addMapping applies the offset)
          tok3[n3 - 1] = '\0';
          for (i = 0; code1 <= code2; ++code1, ++i) {
            addMapping(code1, tok3 + 1, n3 - 2, i);
          }
        } else {
          error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
        }
      }
      pst->getToken(tok1, sizeof(tok1), &n1);

    } else {
      // not an operator we handle: remember it as the previous token
      strcpy(tok1, tok2);
    }
  }
  delete pst;
}
// Records the mapping for <code>.  <uStr> holds <n> hex digits.
//  - n <= 4: the digits are one value; map[code] becomes value + offset.
//  - n  > 4: the digits are read as n/4 groups of four; the sequence is
//    appended to sMap with <offset> added to its last element, and
//    map[code] is cleared.
// The direct table is grown (zero-filled, in multiples of 256) when
// <code> lies beyond it.  If anything is illegal an error is reported
// and the object is left unchanged.
void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
                                   int offset) {
  CharCode newLen, i;
  Unicode u;
  Unicode *seq;
  char uHex[5];
  int seqLen, j;

  if (code >= mapLen) {
    // Work on a local so that mapLen keeps describing the real size of
    // map[] if the rounded-up length wrapped around.
    newLen = (code + 256) & ~255;
    if (unlikely(code >= newLen)) {
      error(-1, "Illegal code value in CharCodeToUnicode::addMapping");
      return;
    }
    map = (Unicode *)greallocn(map, newLen, sizeof(Unicode));
    for (i = mapLen; i < newLen; ++i) {
      map[i] = 0;
    }
    mapLen = newLen;
  }

  if (n <= 4) {
    if (sscanf(uStr, "%x", &u) != 1) {
      error(-1, "Illegal entry in ToUnicode CMap");
      return;
    }
    map[code] = u + offset;
  } else {
    // Parse the whole sequence before touching map/sMap, so that a bad
    // group cannot leave an entry with uninitialized values behind.
    seqLen = n / 4;
    seq = (Unicode *)gmallocn(seqLen, sizeof(Unicode));
    for (j = 0; j < seqLen; ++j) {
      strncpy(uHex, uStr + j*4, 4);
      uHex[4] = '\0';
      if (sscanf(uHex, "%x", &seq[j]) != 1) {
        error(-1, "Illegal entry in ToUnicode CMap");
        gfree(seq);
        return;
      }
    }
    seq[seqLen - 1] += offset;

    if (sMapLen >= sMapSize) {
      sMapSize = sMapSize + 16;
      sMap = (CharCodeToUnicodeString *)
               greallocn(sMap, sMapSize, sizeof(CharCodeToUnicodeString));
    }
    map[code] = 0;
    sMap[sMapLen].c = code;
    sMap[sMapLen].len = seqLen;
    sMap[sMapLen].u = seq;
    ++sMapLen;
  }
}
// Creates an empty mapping: a zero-filled direct table of 256 entries,
// no multi-value entries, reference count 1.  <tagA> is stored as the
// tag; the destructor deletes it.
CharCodeToUnicode::CharCodeToUnicode(GooString *tagA) {
  tag = tagA;
  refCnt = 1;

  mapLen = 256;
  map = (Unicode *)gmallocn(mapLen, sizeof(Unicode));
  memset(map, 0, mapLen * sizeof(Unicode));

  sMap = NULL;
  sMapLen = 0;
  sMapSize = 0;
#if MULTITHREADED
  gInitMutex(&mutex);
#endif
}
// Creates a mapping from prepared tables, with reference count 1.
//  tagA      - stored as the tag; the destructor deletes it
//  mapA      - direct table of mapLenA entries; duplicated when copyMap
//              is true, otherwise stored as is (the destructor gfrees
//              whichever pointer ends up in map)
//  sMapA     - multi-value entries, stored as is; the destructor gfrees
//              the array and each entry's u
//  sMapLenA  - number of entries in use in sMapA
//  sMapSizeA - number of entries allocated in sMapA
CharCodeToUnicode::CharCodeToUnicode(GooString *tagA, Unicode *mapA,
                                     CharCode mapLenA, GBool copyMap,
                                     CharCodeToUnicodeString *sMapA,
                                     int sMapLenA, int sMapSizeA) {
  tag = tagA;
  refCnt = 1;

  mapLen = mapLenA;
  if (!copyMap) {
    map = mapA;
  } else {
    map = (Unicode *)gmallocn(mapLen, sizeof(Unicode));
    memcpy(map, mapA, mapLen * sizeof(Unicode));
  }

  sMap = sMapA;
  sMapSize = sMapSizeA;
  sMapLen = sMapLenA;
#if MULTITHREADED
  gInitMutex(&mutex);
#endif
}
// Releases the tag, the direct table and every multi-value entry.
CharCodeToUnicode::~CharCodeToUnicode() {
  // deleting a null pointer does nothing, so no check is needed
  delete tag;
  gfree(map);

  if (sMap) {
    int k = 0;
    while (k < sMapLen) {
      gfree(sMap[k].u);
      ++k;
    }
    gfree(sMap);
  }
#if MULTITHREADED
  gDestroyMutex(&mutex);
#endif
}
// Adds one reference to this object.  In MULTITHREADED builds the
// counter is updated while holding the object's mutex.
void CharCodeToUnicode::incRefCnt() {
#if MULTITHREADED
  gLockMutex(&mutex);
#endif
  refCnt += 1;
#if MULTITHREADED
  gUnlockMutex(&mutex);
#endif
}
void CharCodeToUnicode::decRefCnt() {
GBool done;
#if MULTITHREADED
gLockMutex(&mutex);
#endif
done = --refCnt == 0;
#if MULTITHREADED
gUnlockMutex(&mutex);
#endif
if (done) {
delete this;
}
}
// Returns true iff this object has a tag and tag->cmp(tagA) is 0.
GBool CharCodeToUnicode::match(GooString *tagA) {
  const bool sameTag = (tag != NULL) && (tag->cmp(tagA) == 0);
  return sameTag;
}
// Sets the mapping for <c> to the <len> values in <u>.
//  - len == 1: the value is stored in the direct table.
//  - otherwise: the sequence is copied into sMap, replacing an existing
//    entry for <c> if there is one, and the direct slot is cleared.
// The direct table is not grown here: a code outside it is reported and
// ignored.
void CharCodeToUnicode::setMapping(CharCode c, Unicode *u, int len) {
  int i, j;

  // map[] has only mapLen entries; writing past it would corrupt the
  // heap (mapToUnicode() refuses such codes in the same way)
  if (c >= mapLen) {
    error(-1, "Illegal code value in CharCodeToUnicode::setMapping");
    return;
  }

  if (len == 1) {
    map[c] = u[0];
  } else {
    // look for an entry to reuse; its old sequence is released
    for (i = 0; i < sMapLen; ++i) {
      if (sMap[i].c == c) {
        gfree(sMap[i].u);
        break;
      }
    }
    // none found: i == sMapLen is the slot of a new entry
    if (i == sMapLen) {
      if (sMapLen == sMapSize) {
        sMapSize += 8;
        sMap = (CharCodeToUnicodeString *)
                 greallocn(sMap, sMapSize, sizeof(CharCodeToUnicodeString));
      }
      ++sMapLen;
    }
    map[c] = 0;
    sMap[i].c = c;
    sMap[i].len = len;
    sMap[i].u = (Unicode *)gmallocn(len, sizeof(Unicode));
    for (j = 0; j < len; ++j) {
      sMap[i].u[j] = u[j];
    }
  }
}
// Looks up the Unicode value(s) for <c>.  On success *u is pointed at
// storage owned by this object and the number of values is returned;
// 0 is returned (and *u left alone) when <c> is outside the direct
// table or has no mapping.
int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode **u) {
  if (c >= mapLen) {
    return 0;
  }

  // a non-zero direct entry is a single-value mapping
  if (map[c]) {
    *u = &map[c];
    return 1;
  }

  // otherwise search the multi-value entries,
  // in reverse so CMap takes precedence
  int idx = sMapLen;
  while (idx-- > 0) {
    if (sMap[idx].c == c) {
      *u = sMap[idx].u;
      return sMap[idx].len;
    }
  }
  return 0;
}
// Reverse lookup: finds a char code that maps to the <usize> Unicode
// values in <u>.  Returns 1 and stores the code in *c when one is found,
// 0 otherwise.  A single value (usize == 1) is searched for in the
// direct table only, and *c is set to 'x' if that search fails; longer
// sequences are searched for in the multi-value entries only.
int CharCodeToUnicode::mapToCharCode(Unicode* u, CharCode *c, int usize) {
  //look for charcode in map
  if (usize == 1) {
    for (CharCode i=0; i<mapLen; i++) {
      if (map[i] == *u) {
        *c = i;
        return 1;
      }
    }
    *c = 'x';
  } else {
    int i, j;
    //for each entry in the sMap
    for (i=0; i<sMapLen; i++) {
      //if the entry's length differs from usize, the sequences are
      // obviously different
      if (sMap[i].len != usize) continue;
      //compare the sequences value by value, stopping at the first
      // mismatch (so that j == len below really means "all equal")
      for (j=0; j<sMap[i].len; j++) {
        if (sMap[i].u[j] != u[j]) {
          break;
        }
      }
      //we have the same sequences
      if (j==sMap[i].len) {
        *c = sMap[i].c;
        return 1;
      }
    }
  }
  return 0;
}
//------------------------------------------------------------------------
// Creates a cache with <sizeA> slots, all of them empty (NULL).
CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA) {
  size = sizeA;
  cache = (CharCodeToUnicode **)gmallocn(size, sizeof(CharCodeToUnicode *));

  int slot = 0;
  while (slot < size) {
    cache[slot] = NULL;
    ++slot;
  }
}
// Drops the cache's reference on every stored object and frees the slot
// array.
CharCodeToUnicodeCache::~CharCodeToUnicodeCache() {
  for (int slot = 0; slot < size; ++slot) {
    CharCodeToUnicode *entry = cache[slot];
    if (!entry) {
      continue;
    }
    entry->decRefCnt();
  }
  gfree(cache);
}
// Returns the cached object whose tag matches <tag>, or NULL if there is
// none.  A hit is moved to slot 0 (the entries that were in front of it
// slide back by one) and gets an extra reference for the caller.
CharCodeToUnicode *CharCodeToUnicodeCache::getCharCodeToUnicode(GooString *tag) {
  // hit in the front slot: nothing to reorder
  if (cache[0] && cache[0]->match(tag)) {
    cache[0]->incRefCnt();
    return cache[0];
  }

  for (int pos = 1; pos < size; ++pos) {
    if (!cache[pos] || !cache[pos]->match(tag)) {
      continue;
    }
    CharCodeToUnicode *hit = cache[pos];
    // slide slots [0, pos) back by one to open up slot 0
    memmove(cache + 1, cache, pos * sizeof(CharCodeToUnicode *));
    cache[0] = hit;
    hit->incRefCnt();
    return hit;
  }
  return NULL;
}
// Inserts <ctu> at the front of the cache, taking a reference on it.
// All existing entries slide back by one slot; the entry in the last
// slot, if any, falls out and has its reference dropped.
void CharCodeToUnicodeCache::add(CharCodeToUnicode *ctu) {
  const int last = size - 1;

  CharCodeToUnicode *evicted = cache[last];
  if (evicted) {
    evicted->decRefCnt();
  }

  int slot = last;
  while (slot >= 1) {
    cache[slot] = cache[slot - 1];
    --slot;
  }

  cache[0] = ctu;
  ctu->incRefCnt();
}