blob: 65ded8cbbe769f40ec223d1a020bc7f0cb06c555 [file] [log] [blame]
//========================================================================
//
// TextOutputDev.cc
//
// Copyright 1997-2003 Glyph & Cog, LLC
//
//========================================================================
//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
// Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
// Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
// Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
// Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
// Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
// Copyright (C) 2008, 2010-2012, 2014-2019 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2008 Pino Toscano <pino@kde.org>
// Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
// Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
// Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
// Copyright (C) 2010 Marek Kasik <mkasik@redhat.com>
// Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
// Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
// Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
// Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
// Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
// Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
// Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
// Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
// Copyright (C) 2018, 2019 Nelson Benítez León <nbenitezl@gmail.com>
// Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
// Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
// Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <cmath>
#include <float.h>
#include <ctype.h>
#include <algorithm>
#ifdef _WIN32
#include <fcntl.h> // for O_BINARY
#include <io.h> // for setmode
#endif
#include "goo/gfile.h"
#include "goo/gmem.h"
#include "goo/GooString.h"
#include "poppler-config.h"
#include "Error.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "UnicodeTypeTable.h"
#include "Link.h"
#include "TextOutputDev.h"
#include "Page.h"
#include "Annot.h"
#include "UTF.h"
//------------------------------------------------------------------------
// parameters
//------------------------------------------------------------------------
// Each bucket in a text pool includes baselines within a range of
// this many points.
#define textPoolStep 4
// Inter-character space width which will cause addChar to start a new
// word.
#define minWordBreakSpace 0.1
// Negative inter-character space width, i.e., overlap, which will
// cause addChar to start a new word.
#define minDupBreakOverlap 0.2
// Max distance between baselines of two lines within a block, as a
// fraction of the font size.
#define maxLineSpacingDelta 1.5
// Max difference in primary font sizes on two lines in the same
// block. Delta1 is used when examining new lines above and below the
// current block; delta2 is used when examining text that overlaps the
// current block; delta3 is used when examining text to the left and
// right of the current block.
#define maxBlockFontSizeDelta1 0.05
#define maxBlockFontSizeDelta2 0.6
#define maxBlockFontSizeDelta3 0.2
// Max difference in font sizes inside a word.
#define maxWordFontSizeDelta 0.05
// Maximum distance between baselines of two words on the same line,
// e.g., distance between subscript or superscript and the primary
// baseline, as a fraction of the font size.
#define maxIntraLineDelta 0.5
// Minimum inter-word spacing, as a fraction of the font size. (Only
// used for raw ordering.)
#define minWordSpacing 0.15
// Maximum inter-word spacing, as a fraction of the font size.
#define maxWordSpacing 1.5
// Maximum horizontal spacing which will allow a word to be pulled
// into a block.
#define minColSpacing1 0.3
// Minimum spacing between columns, as a fraction of the font size.
#define minColSpacing2 1.0
// Maximum vertical spacing between blocks within a flow, as a
// multiple of the font size.
#define maxBlockSpacing 2.5
// Minimum spacing between characters within a word, as a fraction of
// the font size.
#define minCharSpacing -0.5
// Maximum spacing between characters within a word, as a fraction of
// the font size, when there is no obvious extra-wide character
// spacing.
#define maxCharSpacing 0.03
// When extra-wide character spacing is detected, the inter-character
// space threshold is set to the minimum inter-character space
// multiplied by this constant.
#define maxWideCharSpacingMul 1.3
// Upper limit on spacing between characters in a word.
#define maxWideCharSpacing 0.4
// Max difference in primary,secondary coordinates (as a fraction of
// the font size) allowed for duplicated text (fake boldface, drop
// shadows) which is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2
// Max width of underlines (in points).
#define maxUnderlineWidth 3
// Min distance between baseline and underline (in points).
//~ this should be font-size-dependent
#define minUnderlineGap -2
// Max distance between baseline and underline (in points).
//~ this should be font-size-dependent
#define maxUnderlineGap 4
// Max horizontal distance between edge of word and start of underline
// (in points).
//~ this should be font-size-dependent
#define underlineSlack 1
// Max distance between edge of text and edge of link border
#define hyperlinkSlack 2
// Max distance between characters when combining a base character and
// combining character
#define combMaxMidDelta 0.3
#define combMaxBaseDelta 0.4
// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
// (Or 1/tan(angle) for 90/270 degrees.)
#define diagonalThreshold 0.1
namespace {
inline bool isAscii7 (Unicode uchar) {
return uchar < 128;
}
}
static int reorderText(Unicode *text, int len, UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode* u) {
char lre[8], rle[8], popdf[8], buf[8];
int lreLen = 0, rleLen = 0, popdfLen = 0, n;
int nCols, i, j, k;
nCols = 0;
if (s) {
lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
}
if (primaryLR) {
i = 0;
while (i < len) {
// output a left-to-right section
for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
for (k = i; k < j; ++k) {
if (s) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
if (u) u[nCols] = text[k];
++nCols;
}
i = j;
// output a right-to-left section
for (j = i;
j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
++j) ;
if (j > i) {
if (s) s->append(rle, rleLen);
for (k = j - 1; k >= i; --k) {
if (s) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
if (u) u[nCols] = text[k];
++nCols;
}
if (s) s->append(popdf, popdfLen);
i = j;
}
}
} else {
// Note: This code treats numeric characters (European and
// Arabic/Indic) as left-to-right, which isn't strictly correct
// (incurs extra LRE/POPDF pairs), but does produce correct
// visual formatting.
if (s) s->append(rle, rleLen);
i = len - 1;
while (i >= 0) {
// output a right-to-left section
for (j = i;
j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
--j) ;
for (k = i; k > j; --k) {
if (s) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
if (u) u[nCols] = text[k];
++nCols;
}
i = j;
// output a left-to-right section
for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
if (j < i) {
if (s) s->append(lre, lreLen);
for (k = j + 1; k <= i; ++k) {
if (s) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
if (u) u[nCols] = text[k];
++nCols;
}
if (s) s->append(popdf, popdfLen);
i = j;
}
}
if (s) s->append(popdf, popdfLen);
}
return nCols;
}
//------------------------------------------------------------------------
// TextUnderline
//------------------------------------------------------------------------
class TextUnderline {
public:
TextUnderline(double x0A, double y0A, double x1A, double y1A)
{ x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
~TextUnderline() {}
double x0, y0, x1, y1;
bool horiz;
};
//------------------------------------------------------------------------
// TextLink
//------------------------------------------------------------------------
class TextLink {
public:
TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
{ xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
~TextLink() {}
int xMin, yMin, xMax, yMax;
AnnotLink *link;
};
//------------------------------------------------------------------------
// TextFontInfo
//------------------------------------------------------------------------
TextFontInfo::TextFontInfo(GfxState *state) {
gfxFont = state->getFont();
if (gfxFont)
gfxFont->incRefCnt();
#ifdef TEXTOUT_WORD_LIST
fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy()
: nullptr;
flags = gfxFont ? gfxFont->getFlags() : 0;
#endif
}
TextFontInfo::~TextFontInfo() {
if (gfxFont)
gfxFont->decRefCnt();
#ifdef TEXTOUT_WORD_LIST
if (fontName) {
delete fontName;
}
#endif
}
bool TextFontInfo::matches(GfxState *state) const {
return state->getFont() == gfxFont;
}
bool TextFontInfo::matches(const TextFontInfo *fontInfo) const {
return gfxFont == fontInfo->gfxFont;
}
double TextFontInfo::getAscent() const {
return gfxFont ? gfxFont->getAscent() : 0.95;
}
double TextFontInfo::getDescent() const {
return gfxFont ? gfxFont->getDescent() : -0.35;
}
int TextFontInfo::getWMode() const {
return gfxFont ? gfxFont->getWMode() : 0;
}
//------------------------------------------------------------------------
// TextWord
//------------------------------------------------------------------------
TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA) {
rot = rotA;
fontSize = fontSizeA;
text = nullptr;
charcode = nullptr;
edge = nullptr;
charPos = nullptr;
font = nullptr;
textMat = nullptr;
len = size = 0;
spaceAfter = false;
next = nullptr;
#ifdef TEXTOUT_WORD_LIST
GfxRGB rgb;
if ((state->getRender() & 3) == 1) {
state->getStrokeRGB(&rgb);
} else {
state->getFillRGB(&rgb);
}
colorR = colToDbl(rgb.r);
colorG = colToDbl(rgb.g);
colorB = colToDbl(rgb.b);
#endif
underlined = false;
link = nullptr;
}
TextWord::~TextWord() {
gfree(text);
gfree(charcode);
gfree(edge);
gfree(charPos);
gfree(font);
gfree(textMat);
}
void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
double dx, double dy, int charPosA, int charLen,
CharCode c, Unicode u, const Matrix &textMatA) {
ensureCapacity(len+1);
text[len] = u;
charcode[len] = c;
charPos[len] = charPosA;
charPos[len + 1] = charPosA + charLen;
font[len] = fontA;
textMat[len] = textMatA;
if (len == 0)
setInitialBounds(fontA, x, y);
if (wMode) { // vertical writing mode
// NB: the rotation value has been incremented by 1 (in
// TextPage::beginWord()) for vertical writing mode
switch (rot) {
case 0:
edge[len] = x - fontSize;
xMax = edge[len+1] = x;
break;
case 1:
edge[len] = y - fontSize;
yMax = edge[len+1] = y;
break;
case 2:
edge[len] = x + fontSize;
xMin = edge[len+1] = x;
break;
case 3:
edge[len] = y + fontSize;
yMin = edge[len+1] = y;
break;
}
} else { // horizontal writing mode
switch (rot) {
case 0:
edge[len] = x;
xMax = edge[len+1] = x + dx;
break;
case 1:
edge[len] = y;
yMax = edge[len+1] = y + dy;
break;
case 2:
edge[len] = x;
xMin = edge[len+1] = x + dx;
break;
case 3:
edge[len] = y;
yMin = edge[len+1] = y + dy;
break;
}
}
++len;
}
void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) {
double ascent = fontA->getAscent() * fontSize;
double descent = fontA->getDescent() * fontSize;
wMode = fontA->getWMode();
if (wMode) { // vertical writing mode
// NB: the rotation value has been incremented by 1 (in
// TextPage::beginWord()) for vertical writing mode
switch (rot) {
case 0:
xMin = x - fontSize;
yMin = y - fontSize;
yMax = y;
base = y;
break;
case 1:
xMin = x;
yMin = y - fontSize;
xMax = x + fontSize;
base = x;
break;
case 2:
yMin = y;
xMax = x + fontSize;
yMax = y + fontSize;
base = y;
break;
case 3:
xMin = x - fontSize;
xMax = x;
yMax = y + fontSize;
base = x;
break;
}
} else { // horizontal writing mode
switch (rot) {
case 0:
xMin = x;
yMin = y - ascent;
yMax = y - descent;
if (yMin == yMax) {
// this is a sanity check for a case that shouldn't happen -- but
// if it does happen, we want to avoid dividing by zero later
yMin = y;
yMax = y + 1;
}
base = y;
break;
case 1:
xMin = x + descent;
yMin = y;
xMax = x + ascent;
if (xMin == xMax) {
// this is a sanity check for a case that shouldn't happen -- but
// if it does happen, we want to avoid dividing by zero later
xMin = x;
xMax = x + 1;
}
base = x;
break;
case 2:
yMin = y + descent;
xMax = x;
yMax = y + ascent;
if (yMin == yMax) {
// this is a sanity check for a case that shouldn't happen -- but
// if it does happen, we want to avoid dividing by zero later
yMin = y;
yMax = y + 1;
}
base = y;
break;
case 3:
xMin = x - ascent;
xMax = x - descent;
yMax = y;
if (xMin == xMax) {
// this is a sanity check for a case that shouldn't happen -- but
// if it does happen, we want to avoid dividing by zero later
xMin = x;
xMax = x + 1;
}
base = x;
break;
}
}
}
void TextWord::ensureCapacity(int capacity) {
if (capacity > size) {
size = std::max(size + 16, capacity);
text = (Unicode *)greallocn(text, size, sizeof(Unicode));
charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
edge = (double *)greallocn(edge, (size + 1), sizeof(double));
charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
}
}
struct CombiningTable {
Unicode base;
Unicode comb;
};
static struct CombiningTable combiningTable[] = {
{0x0060, 0x0300}, // grave
{0x00a8, 0x0308}, // dieresis
{0x00af, 0x0304}, // macron
{0x00b4, 0x0301}, // acute
{0x00b8, 0x0327}, // cedilla
{0x02c6, 0x0302}, // circumflex
{0x02c7, 0x030c}, // caron
{0x02d8, 0x0306}, // breve
{0x02d9, 0x0307}, // dotaccent
{0x02da, 0x030a}, // ring
{0x02dc, 0x0303}, // tilde
{0x02dd, 0x030b} // hungarumlaut (double acute accent)
};
// returning combining versions of characters
static Unicode getCombiningChar(Unicode u) {
for (const CombiningTable &combining : combiningTable) {
if (u == combining.base)
return combining.comb;
}
return 0;
}
bool TextWord::addCombining(GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y,
double dx, double dy, int charPosA, int charLen,
CharCode c, Unicode u, const Matrix &textMatA) {
if (len == 0 || wMode != 0 || fontA->getWMode() != 0)
return false;
Unicode cCurrent = getCombiningChar(u);
Unicode cPrev = getCombiningChar(text[len-1]);
double edgeMid = (edge[len-1] + edge[len]) / 2;
double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
if (cCurrent != 0 && unicodeTypeAlphaNum(text[len-1])) {
// Current is a combining character, previous is base character
maxScaledMidDelta = fabs(edge[len] - edge[len-1]) * combMaxMidDelta;
charMid = charBase = maxScaledBaseDelta = 0;
// Test if characters overlap
if (rot == 0 || rot == 2) {
charMid = x + (dx / 2);
charBase = y;
maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
} else {
charMid = y + (dy / 2);
charBase = x;
maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
}
if (fabs(charMid - edgeMid) >= maxScaledMidDelta ||
fabs(charBase - base) >= maxScaledBaseDelta)
return false;
// Add character, but don't adjust edge / bounding box because
// combining character's positioning could be odd.
ensureCapacity(len+1);
text[len] = cCurrent;
charcode[len] = c;
charPos[len] = charPosA;
charPos[len+1] = charPosA + charLen;
font[len] = fontA;
textMat[len] = textMatA;
edge[len+1] = edge[len];
edge[len] = (edge[len+1] + edge[len-1]) / 2;
++len;
return true;
}
if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
// Previous is a combining character, current is base character
maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
charMid = charBase = maxScaledMidDelta = 0;
// Test if characters overlap
if (rot == 0 || rot == 2) {
charMid = x + (dx / 2);
charBase = y;
maxScaledMidDelta = fabs(dx * combMaxMidDelta);
} else {
charMid = y + (dy / 2);
charBase = x;
maxScaledMidDelta = fabs(dy * combMaxMidDelta);
}
if (fabs(charMid - edgeMid) >= maxScaledMidDelta ||
fabs(charBase - base) >= maxScaledBaseDelta)
return false;
// move combining character to after base character
ensureCapacity(len+1);
fontSize = fontSizeA;
text[len] = cPrev;
charcode[len] = charcode[len-1];
charPos[len] = charPosA;
charPos[len+1] = charPosA + charLen;
font[len] = font[len-1];
textMat[len] = textMat[len-1];
text[len-1] = u;
charcode[len-1] = c;
font[len-1] = fontA;
textMat[len-1] = textMatA;
if (len == 1)
setInitialBounds(fontA, x, y);
// Updated edges / bounding box because we changed the base
// character.
if (wMode) {
switch (rot) {
case 0:
edge[len-1] = x - fontSize;
xMax = edge[len+1] = x;
break;
case 1:
edge[len-1] = y - fontSize;
yMax = edge[len+1] = y;
break;
case 2:
edge[len-1] = x + fontSize;
xMin = edge[len+1] = x;
break;
case 3:
edge[len-1] = y + fontSize;
yMin = edge[len+1] = y;
break;
}
} else {
switch (rot) {
case 0:
edge[len-1] = x;
xMax = edge[len+1] = x + dx;
break;
case 1:
edge[len-1] = y;
yMax = edge[len+1] = y + dy;
break;
case 2:
edge[len-1] = x;
xMin = edge[len+1] = x + dx;
break;
case 3:
edge[len-1] = y;
yMin = edge[len+1] = y + dy;
break;
}
}
edge[len] = (edge[len+1] + edge[len-1]) / 2;
++len;
return true;
}
return false;
}
void TextWord::merge(TextWord *word) {
int i;
if (word->xMin < xMin) {
xMin = word->xMin;
}
if (word->yMin < yMin) {
yMin = word->yMin;
}
if (word->xMax > xMax) {
xMax = word->xMax;
}
if (word->yMax > yMax) {
yMax = word->yMax;
}
ensureCapacity(len + word->len);
for (i = 0; i < word->len; ++i) {
text[len + i] = word->text[i];
charcode[len + i] = word->charcode[i];
edge[len + i] = word->edge[i];
charPos[len + i] = word->charPos[i];
font[len + i] = word->font[i];
textMat[len + i] = word->textMat[i];
}
edge[len + word->len] = word->edge[word->len];
charPos[len + word->len] = word->charPos[word->len];
len += word->len;
}
inline int TextWord::primaryCmp(TextWord *word) {
double cmp;
cmp = 0; // make gcc happy
switch (rot) {
case 0:
cmp = xMin - word->xMin;
break;
case 1:
cmp = yMin - word->yMin;
break;
case 2:
cmp = word->xMax - xMax;
break;
case 3:
cmp = word->yMax - yMax;
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
double TextWord::primaryDelta(TextWord *word) {
double delta;
delta = 0; // make gcc happy
switch (rot) {
case 0:
delta = word->xMin - xMax;
break;
case 1:
delta = word->yMin - yMax;
break;
case 2:
delta = xMin - word->xMax;
break;
case 3:
delta = yMin - word->yMax;
break;
}
return delta;
}
int TextWord::cmpYX(const void *p1, const void *p2) {
TextWord *word1 = *(TextWord **)p1;
TextWord *word2 = *(TextWord **)p2;
double cmp;
cmp = word1->yMin - word2->yMin;
if (cmp == 0) {
cmp = word1->xMin - word2->xMin;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
#ifdef TEXTOUT_WORD_LIST
GooString *TextWord::getText() {
GooString *s;
UnicodeMap *uMap;
char buf[8];
int n, i;
s = new GooString();
if (!(uMap = globalParams->getTextEncoding())) {
return s;
}
for (i = 0; i < len; ++i) {
n = uMap->mapUnicode(text[i], buf, sizeof(buf));
s->append(buf, n);
}
uMap->decRefCnt();
return s;
}
void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
double *xMaxA, double *yMaxA) {
if (charIdx < 0 || charIdx >= len) {
return;
}
switch (rot) {
case 0:
*xMinA = edge[charIdx];
*xMaxA = edge[charIdx + 1];
*yMinA = yMin;
*yMaxA = yMax;
break;
case 1:
*xMinA = xMin;
*xMaxA = xMax;
*yMinA = edge[charIdx];
*yMaxA = edge[charIdx + 1];
break;
case 2:
*xMinA = edge[charIdx + 1];
*xMaxA = edge[charIdx];
*yMinA = yMin;
*yMaxA = yMax;
break;
case 3:
*xMinA = xMin;
*xMaxA = xMax;
*yMinA = edge[charIdx + 1];
*yMaxA = edge[charIdx];
break;
}
}
#endif // TEXTOUT_WORD_LIST
//------------------------------------------------------------------------
// TextPool
//------------------------------------------------------------------------
TextPool::TextPool() {
minBaseIdx = 0;
maxBaseIdx = -1;
pool = nullptr;
cursor = nullptr;
cursorBaseIdx = -1;
}
TextPool::~TextPool() {
int baseIdx;
TextWord *word, *word2;
for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
word2 = word->next;
delete word;
}
}
gfree(pool);
}
int TextPool::getBaseIdx(double base) {
const double baseIdxDouble = base / textPoolStep;
if (baseIdxDouble < minBaseIdx) {
return minBaseIdx;
}
if (baseIdxDouble > maxBaseIdx) {
return maxBaseIdx;
}
return (int)baseIdxDouble;
}
void TextPool::addWord(TextWord *word) {
int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
TextWord *w0, *w1;
// expand the array if needed
wordBaseIdx = (int)(word->base / textPoolStep);
if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) {
error(errSyntaxWarning, -1, "wordBaseIdx out of range");
delete word;
return;
}
if (minBaseIdx > maxBaseIdx) {
minBaseIdx = wordBaseIdx - 128;
maxBaseIdx = wordBaseIdx + 128;
pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
sizeof(TextWord *));
for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
pool[baseIdx - minBaseIdx] = nullptr;
}
} else if (wordBaseIdx < minBaseIdx) {
newMinBaseIdx = wordBaseIdx - 128;
TextWord **newPool = (TextWord **)gmallocn_checkoverflow(maxBaseIdx - newMinBaseIdx + 1,
sizeof(TextWord *));
if (unlikely(!newPool)) {
error(errSyntaxWarning, -1, "newPool would overflow");
delete word;
return;
}
for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
newPool[baseIdx - newMinBaseIdx] = nullptr;
}
memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
(maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
gfree(pool);
pool = newPool;
minBaseIdx = newMinBaseIdx;
} else if (wordBaseIdx > maxBaseIdx) {
newMaxBaseIdx = wordBaseIdx + 128;
TextWord **reallocatedPool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
sizeof(TextWord *), true /*checkoverflow*/, false /*free_pool*/);
if (!reallocatedPool) {
error(errSyntaxWarning, -1, "new pool size would overflow");
delete word;
return;
}
pool = reallocatedPool;
for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
pool[baseIdx - minBaseIdx] = nullptr;
}
maxBaseIdx = newMaxBaseIdx;
}
// insert the new word
if (cursor && wordBaseIdx == cursorBaseIdx &&
word->primaryCmp(cursor) >= 0) {
w0 = cursor;
w1 = cursor->next;
} else {
w0 = nullptr;
w1 = pool[wordBaseIdx - minBaseIdx];
}
for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
word->next = w1;
if (w0) {
w0->next = word;
} else {
pool[wordBaseIdx - minBaseIdx] = word;
}
cursor = word;
cursorBaseIdx = wordBaseIdx;
}
//------------------------------------------------------------------------
// TextLine
//------------------------------------------------------------------------
TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
blk = blkA;
rot = rotA;
base = baseA;
words = lastWord = nullptr;
text = nullptr;
edge = nullptr;
col = nullptr;
len = 0;
convertedLen = 0;
hyphenated = false;
next = nullptr;
xMin = yMin = 0;
xMax = yMax = -1;
normalized = nullptr;
normalized_len = 0;
normalized_idx = nullptr;
ascii_translation = nullptr;
ascii_len = 0;
ascii_idx = nullptr;
}
TextLine::~TextLine() {
TextWord *word;
while (words) {
word = words;
words = words->next;
delete word;
}
gfree(text);
gfree(edge);
gfree(col);
if (normalized) {
gfree(normalized);
gfree(normalized_idx);
}
if (ascii_translation) {
gfree(ascii_translation);
gfree(ascii_idx);
}
}
void TextLine::addWord(TextWord *word) {
if (lastWord) {
lastWord->next = word;
} else {
words = word;
}
lastWord = word;
if (xMin > xMax) {
xMin = word->xMin;
xMax = word->xMax;
yMin = word->yMin;
yMax = word->yMax;
} else {
if (word->xMin < xMin) {
xMin = word->xMin;
}
if (word->xMax > xMax) {
xMax = word->xMax;
}
if (word->yMin < yMin) {
yMin = word->yMin;
}
if (word->yMax > yMax) {
yMax = word->yMax;
}
}
}
double TextLine::primaryDelta(TextLine *line) {
double delta;
delta = 0; // make gcc happy
switch (rot) {
case 0:
delta = line->xMin - xMax;
break;
case 1:
delta = line->yMin - yMax;
break;
case 2:
delta = xMin - line->xMax;
break;
case 3:
delta = yMin - line->yMax;
break;
}
return delta;
}
int TextLine::primaryCmp(TextLine *line) {
double cmp;
cmp = 0; // make gcc happy
switch (rot) {
case 0:
cmp = xMin - line->xMin;
break;
case 1:
cmp = yMin - line->yMin;
break;
case 2:
cmp = line->xMax - xMax;
break;
case 3:
cmp = line->yMax - yMax;
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextLine::secondaryCmp(TextLine *line) {
double cmp;
cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextLine::cmpYX(TextLine *line) {
int cmp;
if ((cmp = secondaryCmp(line))) {
return cmp;
}
return primaryCmp(line);
}
int TextLine::cmpXY(const void *p1, const void *p2) {
TextLine *line1 = *(TextLine **)p1;
TextLine *line2 = *(TextLine **)p2;
int cmp;
if ((cmp = line1->primaryCmp(line2))) {
return cmp;
}
return line1->secondaryCmp(line2);
}
void TextLine::coalesce(UnicodeMap *uMap) {
TextWord *word0, *word1;
double space, delta, minSpace;
bool isUnicode;
char buf[8];
int i, j;
if (words->next) {
// compute the inter-word space threshold
if (words->len > 1 || words->next->len > 1) {
minSpace = 0;
} else {
minSpace = words->primaryDelta(words->next);
for (word0 = words->next, word1 = word0->next;
word1 && minSpace > 0;
word0 = word1, word1 = word0->next) {
if (word1->len > 1) {
minSpace = 0;
}
delta = word0->primaryDelta(word1);
if (delta < minSpace) {
minSpace = delta;
}
}
}
if (minSpace <= 0) {
space = maxCharSpacing * words->fontSize;
} else {
space = maxWideCharSpacingMul * minSpace;
if (space > maxWideCharSpacing * words->fontSize) {
space = maxWideCharSpacing * words->fontSize;
}
}
// merge words
word0 = words;
word1 = words->next;
while (word1) {
if (word0->primaryDelta(word1) >= space) {
word0->spaceAfter = true;
word0 = word1;
word1 = word1->next;
} else if (word0->font[word0->len - 1] == word1->font[0] &&
word0->underlined == word1->underlined &&
fabs(word0->fontSize - word1->fontSize) <
maxWordFontSizeDelta * words->fontSize &&
word1->charPos[0] == word0->charPos[word0->len]) {
word0->merge(word1);
word0->next = word1->next;
delete word1;
word1 = word0->next;
} else {
word0 = word1;
word1 = word1->next;
}
}
}
// build the line text
isUnicode = uMap ? uMap->isUnicode() : false;
len = 0;
for (word1 = words; word1; word1 = word1->next) {
len += word1->len;
if (word1->spaceAfter) {
++len;
}
}
text = (Unicode *)gmallocn(len, sizeof(Unicode));
edge = (double *)gmallocn(len + 1, sizeof(double));
i = 0;
for (word1 = words; word1; word1 = word1->next) {
for (j = 0; j < word1->len; ++j) {
text[i] = word1->text[j];
edge[i] = word1->edge[j];
++i;
}
edge[i] = word1->edge[word1->len];
if (word1->spaceAfter) {
text[i] = (Unicode)0x0020;
++i;
}
}
// compute convertedLen and set up the col array
col = (int *)gmallocn(len + 1, sizeof(int));
convertedLen = 0;
for (i = 0; i < len; ++i) {
col[i] = convertedLen;
if (isUnicode) {
++convertedLen;
} else if (uMap) {
convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
}
}
col[len] = convertedLen;
// check for hyphen at end of line
//~ need to check for other chars used as hyphens
hyphenated = text[len - 1] == (Unicode)'-';
}
//------------------------------------------------------------------------
// TextLineFrag
//------------------------------------------------------------------------
class TextLineFrag {
public:
TextLine *line; // the line object
int start, len; // offset and length of this fragment
// (in Unicode chars)
double xMin, xMax; // bounding box coordinates
double yMin, yMax;
double base; // baseline virtual coordinate
int col; // first column
void init(TextLine *lineA, int startA, int lenA);
void computeCoords(bool oneRot);
static int cmpYXPrimaryRot(const void *p1, const void *p2);
static int cmpYXLineRot(const void *p1, const void *p2);
static int cmpXYLineRot(const void *p1, const void *p2);
static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
static int cmpXYColumnLineRot(const void *p1, const void *p2);
};
void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
line = lineA;
start = startA;
len = lenA;
col = line->col[start];
}
void TextLineFrag::computeCoords(bool oneRot) {
TextBlock *blk;
double d0, d1, d2, d3, d4;
if (oneRot) {
switch (line->rot) {
case 0:
xMin = line->edge[start];
xMax = line->edge[start + len];
yMin = line->yMin;
yMax = line->yMax;
break;
case 1:
xMin = line->xMin;
xMax = line->xMax;
yMin = line->edge[start];
yMax = line->edge[start + len];
break;
case 2:
xMin = line->edge[start + len];
xMax = line->edge[start];
yMin = line->yMin;
yMax = line->yMax;
break;
case 3:
xMin = line->xMin;
xMax = line->xMax;
yMin = line->edge[start + len];
yMax = line->edge[start];
break;
}
base = line->base;
} else {
if (line->rot == 0 && line->blk->page->primaryRot == 0) {
xMin = line->edge[start];
xMax = line->edge[start + len];
yMin = line->yMin;
yMax = line->yMax;
base = line->base;
} else {
blk = line->blk;
d0 = line->edge[start];
d1 = line->edge[start + len];
d2 = d3 = d4 = 0; // make gcc happy
switch (line->rot) {
case 0:
d2 = line->yMin;
d3 = line->yMax;
d4 = line->base;
d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
break;
case 1:
d2 = line->xMax;
d3 = line->xMin;
d4 = line->base;
d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
break;
case 2:
d2 = line->yMax;
d3 = line->yMin;
d4 = line->base;
d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
break;
case 3:
d2 = line->xMin;
d3 = line->xMax;
d4 = line->base;
d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
break;
}
switch (line->blk->page->primaryRot) {
case 0:
xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
base = blk->yMin + d4 * (blk->yMax - blk->yMin);
break;
case 1:
xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
base = blk->xMax - d4 * (blk->xMax - blk->xMin);
break;
case 2:
xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
base = blk->yMax - d4 * (blk->yMax - blk->yMin);
break;
case 3:
xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
base = blk->xMin + d4 * (blk->xMax - blk->xMin);
break;
}
}
}
}
int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
TextLineFrag *frag1 = (TextLineFrag *)p1;
TextLineFrag *frag2 = (TextLineFrag *)p2;
double cmp;
cmp = 0; // make gcc happy
switch (frag1->line->blk->page->primaryRot) {
case 0:
if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
cmp = frag1->xMin - frag2->xMin;
}
break;
case 1:
if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
cmp = frag1->yMin - frag2->yMin;
}
break;
case 2:
if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
cmp = frag2->xMax - frag1->xMax;
}
break;
case 3:
if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
cmp = frag2->yMax - frag1->yMax;
}
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
TextLineFrag *frag1 = (TextLineFrag *)p1;
TextLineFrag *frag2 = (TextLineFrag *)p2;
double cmp;
cmp = 0; // make gcc happy
switch (frag1->line->rot) {
case 0:
if ((cmp = frag1->yMin - frag2->yMin) == 0) {
cmp = frag1->xMin - frag2->xMin;
}
break;
case 1:
if ((cmp = frag2->xMax - frag1->xMax) == 0) {
cmp = frag1->yMin - frag2->yMin;
}
break;
case 2:
if ((cmp = frag2->yMin - frag1->yMin) == 0) {
cmp = frag2->xMax - frag1->xMax;
}
break;
case 3:
if ((cmp = frag1->xMax - frag2->xMax) == 0) {
cmp = frag2->yMax - frag1->yMax;
}
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
TextLineFrag *frag1 = (TextLineFrag *)p1;
TextLineFrag *frag2 = (TextLineFrag *)p2;
double cmp;
cmp = 0; // make gcc happy
switch (frag1->line->rot) {
case 0:
if ((cmp = frag1->xMin - frag2->xMin) == 0) {
cmp = frag1->yMin - frag2->yMin;
}
break;
case 1:
if ((cmp = frag1->yMin - frag2->yMin) == 0) {
cmp = frag2->xMax - frag1->xMax;
}
break;
case 2:
if ((cmp = frag2->xMax - frag1->xMax) == 0) {
cmp = frag2->yMin - frag1->yMin;
}
break;
case 3:
if ((cmp = frag2->yMax - frag1->yMax) == 0) {
cmp = frag1->xMax - frag2->xMax;
}
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
TextLineFrag *frag1 = (TextLineFrag *)p1;
TextLineFrag *frag2 = (TextLineFrag *)p2;
double cmp;
// if columns overlap, compare y values
if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
frag2->line->col[frag2->start]) &&
frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
frag1->line->col[frag1->start])) {
cmp = 0; // make gcc happy
switch (frag1->line->blk->page->primaryRot) {
case 0: cmp = frag1->yMin - frag2->yMin; break;
case 1: cmp = frag2->xMax - frag1->xMax; break;
case 2: cmp = frag2->yMin - frag1->yMin; break;
case 3: cmp = frag1->xMax - frag2->xMax; break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
// otherwise, compare starting column
return frag1->col - frag2->col;
}
int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
TextLineFrag *frag1 = (TextLineFrag *)p1;
TextLineFrag *frag2 = (TextLineFrag *)p2;
double cmp;
// if columns overlap, compare y values
if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
frag2->line->col[frag2->start]) &&
frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
frag1->line->col[frag1->start])) {
cmp = 0; // make gcc happy
switch (frag1->line->rot) {
case 0: cmp = frag1->yMin - frag2->yMin; break;
case 1: cmp = frag2->xMax - frag1->xMax; break;
case 2: cmp = frag2->yMin - frag1->yMin; break;
case 3: cmp = frag1->xMax - frag2->xMax; break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
// otherwise, compare starting column
return frag1->col - frag2->col;
}
//------------------------------------------------------------------------
// TextBlock
//------------------------------------------------------------------------
TextBlock::TextBlock(TextPage *pageA, int rotA) {
page = pageA;
rot = rotA;
xMin = yMin = 0;
xMax = yMax = -1;
priMin = 0;
priMax = page->pageWidth;
pool = new TextPool();
lines = nullptr;
curLine = nullptr;
next = nullptr;
stackNext = nullptr;
tableId = -1;
tableEnd = false;
}
TextBlock::~TextBlock() {
TextLine *line;
delete pool;
while (lines) {
line = lines;
lines = lines->next;
delete line;
}
}
void TextBlock::addWord(TextWord *word) {
pool->addWord(word);
if (xMin > xMax) {
xMin = word->xMin;
xMax = word->xMax;
yMin = word->yMin;
yMax = word->yMax;
} else {
if (word->xMin < xMin) {
xMin = word->xMin;
}
if (word->xMax > xMax) {
xMax = word->xMax;
}
if (word->yMin < yMin) {
yMin = word->yMin;
}
if (word->yMax > yMax) {
yMax = word->yMax;
}
}
}
void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) {
TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
TextLine *line, *line0, *line1;
int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
int baseIdx, bestWordBaseIdx, idx0, idx1;
double minBase, maxBase;
double fontSize, wordSpacing, delta, priDelta, secDelta;
TextLine **lineArray;
bool found, overlap;
int col1, col2;
int i, j, k;
// discard duplicated text (fake boldface, drop shadows)
for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
word0 = pool->getPool(idx0);
while (word0) {
priDelta = dupMaxPriDelta * word0->fontSize;
secDelta = dupMaxSecDelta * word0->fontSize;
maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
found = false;
word1 = word2 = nullptr; // make gcc happy
for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
if (idx1 == idx0) {
word1 = word0;
word2 = word0->next;
} else {
word1 = nullptr;
word2 = pool->getPool(idx1);
}
for (; word2; word1 = word2, word2 = word2->next) {
if (word2->len == word0->len &&
!memcmp(word2->text, word0->text,
word0->len * sizeof(Unicode))) {
switch (rot) {
case 0:
case 2:
found = fabs(word0->xMin - word2->xMin) < priDelta &&
fabs(word0->xMax - word2->xMax) < priDelta &&
fabs(word0->yMin - word2->yMin) < secDelta &&
fabs(word0->yMax - word2->yMax) < secDelta;
break;
case 1:
case 3:
found = fabs(word0->xMin - word2->xMin) < secDelta &&
fabs(word0->xMax - word2->xMax) < secDelta &&
fabs(word0->yMin - word2->yMin) < priDelta &&
fabs(word0->yMax - word2->yMax) < priDelta;
break;
}
}
if (found) {
break;
}
}
if (found) {
break;
}
}
if (found) {
if (word1) {
word1->next = word2->next;
} else {
pool->setPool(idx1, word2->next);
}
delete word2;
} else {
word0 = word0->next;
}
}
}
// build the lines
curLine = nullptr;
poolMinBaseIdx = pool->minBaseIdx;
charCount = 0;
nLines = 0;
while (1) {
// find the first non-empty line in the pool
for (;
poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
++poolMinBaseIdx) ;
if (poolMinBaseIdx > pool->maxBaseIdx) {
break;
}
// look for the left-most word in the first four lines of the
// pool -- this avoids starting with a superscript word
startBaseIdx = poolMinBaseIdx;
for (baseIdx = poolMinBaseIdx + 1;
baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
++baseIdx) {
if (!pool->getPool(baseIdx)) {
continue;
}
if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
< 0) {
startBaseIdx = baseIdx;
}
}
// create a new line
word0 = pool->getPool(startBaseIdx);
pool->setPool(startBaseIdx, word0->next);
word0->next = nullptr;
line = new TextLine(this, word0->rot, word0->base);
line->addWord(word0);
lastWord = word0;
// compute the search range
fontSize = word0->fontSize;
minBase = word0->base - maxIntraLineDelta * fontSize;
maxBase = word0->base + maxIntraLineDelta * fontSize;
minBaseIdx = pool->getBaseIdx(minBase);
maxBaseIdx = pool->getBaseIdx(maxBase);
wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
// find the rest of the words in this line
while (1) {
// find the left-most word whose baseline is in the range for
// this line
bestWordBaseIdx = 0;
bestWord0 = bestWord1 = nullptr;
overlap = false;
for (baseIdx = minBaseIdx;
!overlap && baseIdx <= maxBaseIdx;
++baseIdx) {
for (word0 = nullptr, word1 = pool->getPool(baseIdx);
word1;
word0 = word1, word1 = word1->next) {
if (word1->base >= minBase &&
word1->base <= maxBase) {
delta = lastWord->primaryDelta(word1);
if (delta < minCharSpacing * fontSize) {
overlap = true;
break;
} else {
if (delta < wordSpacing &&
(!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
bestWordBaseIdx = baseIdx;
bestWord0 = word0;
bestWord1 = word1;
}
break;
}
}
}
}
if (overlap || !bestWord1) {
break;
}
// remove it from the pool, and add it to the line
if (bestWord0) {
bestWord0->next = bestWord1->next;
} else {
pool->setPool(bestWordBaseIdx, bestWord1->next);
}
bestWord1->next = nullptr;
line->addWord(bestWord1);
lastWord = bestWord1;
}
// add the line
if (curLine && line->cmpYX(curLine) > 0) {
line0 = curLine;
line1 = curLine->next;
} else {
line0 = nullptr;
line1 = lines;
}
for (;
line1 && line->cmpYX(line1) > 0;
line0 = line1, line1 = line1->next) ;
if (line0) {
line0->next = line;
} else {
lines = line;
}
line->next = line1;
curLine = line;
line->coalesce(uMap);
charCount += line->len;
++nLines;
}
// sort lines into xy order for column assignment
lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
for (line = lines, i = 0; line; line = line->next, ++i) {
lineArray[i] = line;
}
qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
// column assignment
nColumns = 0;
if (fixedPitch) {
for (i = 0; i < nLines; ++i) {
line0 = lineArray[i];
col1 = 0; // make gcc happy
switch (rot) {
case 0:
col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
break;
case 1:
col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
break;
case 2:
col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
break;
case 3:
col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
break;
}
for (k = 0; k <= line0->len; ++k) {
line0->col[k] += col1;
}
if (line0->col[line0->len] > nColumns) {
nColumns = line0->col[line0->len];
}
}
} else {
for (i = 0; i < nLines; ++i) {
line0 = lineArray[i];
col1 = 0;
for (j = 0; j < i; ++j) {
line1 = lineArray[j];
if (line1->primaryDelta(line0) >= 0) {
col2 = line1->col[line1->len] + 1;
} else {
k = 0; // make gcc happy
switch (rot) {
case 0:
for (k = 0;
k < line1->len &&
line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
++k) ;
break;
case 1:
for (k = 0;
k < line1->len &&
line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
++k) ;
break;
case 2:
for (k = 0;
k < line1->len &&
line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
++k) ;
break;
case 3:
for (k = 0;
k < line1->len &&
line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
++k) ;
break;
}
col2 = line1->col[k];
}
if (col2 > col1) {
col1 = col2;
}
}
for (k = 0; k <= line0->len; ++k) {
line0->col[k] += col1;
}
if (line0->col[line0->len] > nColumns) {
nColumns = line0->col[line0->len];
}
}
}
gfree(lineArray);
}
void TextBlock::updatePriMinMax(TextBlock *blk) {
double newPriMin, newPriMax;
bool gotPriMin, gotPriMax;
gotPriMin = gotPriMax = false;
newPriMin = newPriMax = 0; // make gcc happy
switch (page->primaryRot) {
case 0:
case 2:
if (blk->yMin < yMax && blk->yMax > yMin) {
if (blk->xMin < xMin) {
newPriMin = blk->xMax;
gotPriMin = true;
}
if (blk->xMax > xMax) {
newPriMax = blk->xMin;
gotPriMax = true;
}
}
break;
case 1:
case 3:
if (blk->xMin < xMax && blk->xMax > xMin) {
if (blk->yMin < yMin) {
newPriMin = blk->yMax;
gotPriMin = true;
}
if (blk->yMax > yMax) {
newPriMax = blk->yMin;
gotPriMax = true;
}
}
break;
}
if (gotPriMin) {
if (newPriMin > xMin) {
newPriMin = xMin;
}
if (newPriMin > priMin) {
priMin = newPriMin;
}
}
if (gotPriMax) {
if (newPriMax < xMax) {
newPriMax = xMax;
}
if (newPriMax < priMax) {
priMax = newPriMax;
}
}
}
int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
TextBlock *blk1 = *(TextBlock **)p1;
TextBlock *blk2 = *(TextBlock **)p2;
double cmp;
cmp = 0; // make gcc happy
switch (blk1->page->primaryRot) {
case 0:
if ((cmp = blk1->xMin - blk2->xMin) == 0) {
cmp = blk1->yMin - blk2->yMin;
}
break;
case 1:
if ((cmp = blk1->yMin - blk2->yMin) == 0) {
cmp = blk2->xMax - blk1->xMax;
}
break;
case 2:
if ((cmp = blk2->xMax - blk1->xMax) == 0) {
cmp = blk2->yMin - blk1->yMin;
}
break;
case 3:
if ((cmp = blk2->yMax - blk1->yMax) == 0) {
cmp = blk1->xMax - blk2->xMax;
}
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
TextBlock *blk1 = *(TextBlock **)p1;
TextBlock *blk2 = *(TextBlock **)p2;
double cmp;
cmp = 0; // make gcc happy
switch (blk1->page->primaryRot) {
case 0:
if ((cmp = blk1->yMin - blk2->yMin) == 0) {
cmp = blk1->xMin - blk2->xMin;
}
break;
case 1:
if ((cmp = blk2->xMax - blk1->xMax) == 0) {
cmp = blk1->yMin - blk2->yMin;
}
break;
case 2:
if ((cmp = blk2->yMin - blk1->yMin) == 0) {
cmp = blk2->xMax - blk1->xMax;
}
break;
case 3:
if ((cmp = blk1->xMax - blk2->xMax) == 0) {
cmp = blk2->yMax - blk1->yMax;
}
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextBlock::primaryCmp(TextBlock *blk) {
double cmp;
cmp = 0; // make gcc happy
switch (rot) {
case 0:
cmp = xMin - blk->xMin;
break;
case 1:
cmp = yMin - blk->yMin;
break;
case 2:
cmp = blk->xMax - xMax;
break;
case 3:
cmp = blk->yMax - yMax;
break;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
double TextBlock::secondaryDelta(TextBlock *blk) {
double delta;
delta = 0; // make gcc happy
switch (rot) {
case 0:
delta = blk->yMin - yMax;
break;
case 1:
delta = xMin - blk->xMax;
break;
case 2:
delta = yMin - blk->yMax;
break;
case 3:
delta = blk->xMin - xMax;
break;
}
return delta;
}
bool TextBlock::isBelow(TextBlock *blk) {
bool below;
below = false; // make gcc happy
switch (page->primaryRot) {
case 0:
below = xMin >= blk->priMin && xMax <= blk->priMax &&
yMin > blk->yMin;
break;
case 1:
below = yMin >= blk->priMin && yMax <= blk->priMax &&
xMax < blk->xMax;
break;
case 2:
below = xMin >= blk->priMin && xMax <= blk->priMax &&
yMax < blk->yMax;
break;
case 3:
below = yMin >= blk->priMin && yMax <= blk->priMax &&
xMin > blk->xMin;
break;
}
return below;
}
bool TextBlock::isBeforeByRule1(TextBlock *blk1) {
bool before = false;
bool overlap = false;
switch (this->page->primaryRot) {
case 0:
case 2:
overlap = ((this->ExMin <= blk1->ExMin) &&
(blk1->ExMin <= this->ExMax)) ||
((blk1->ExMin <= this->ExMin) &&
(this->ExMin <= blk1->ExMax));
break;
case 1:
case 3:
overlap = ((this->EyMin <= blk1->EyMin) &&
(blk1->EyMin <= this->EyMax)) ||
((blk1->EyMin <= this->EyMin) &&
(this->EyMin <= blk1->EyMax));
break;
}
switch (this->page->primaryRot) {
case 0:
before = overlap && this->EyMin < blk1->EyMin;
break;
case 1:
before = overlap && this->ExMax > blk1->ExMax;
break;
case 2:
before = overlap && this->EyMax > blk1->EyMax;
break;
case 3:
before = overlap && this->ExMin < blk1->ExMin;
break;
}
return before;
}
bool TextBlock::isBeforeByRule2(TextBlock *blk1) {
double cmp = 0;
int rotLR = rot;
if (!page->primaryLR) {
rotLR = (rotLR + 2) % 4;
}
switch (rotLR) {
case 0:
cmp = ExMax - blk1->ExMin;
break;
case 1:
cmp = EyMin - blk1->EyMax;
break;
case 2:
cmp = blk1->ExMax - ExMin;
break;
case 3:
cmp = blk1->EyMin - EyMax;
break;
}
return cmp <= 0;
}
// Sort into reading order by performing a topological sort using the rules
// given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
// See http://pubs.iupr.org/#2003-breuel-sdiut
// Topological sort is done by depth first search, see
// http://en.wikipedia.org/wiki/Topological_sorting
int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1,
TextBlock **sorted, int sortPos,
bool* visited,
TextBlock **cache, int cacheSize) {
int pos2;
TextBlock *blk1, *blk2, *blk3;
bool before;
if (visited[pos1]) {
return sortPos;
}
blk1 = this;
#if 0 // for debugging
printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
#endif
visited[pos1] = true;
pos2 = -1;
for (blk2 = blkList; blk2; blk2 = blk2->next) {
pos2++;
if (visited[pos2]) {
// skip visited nodes
continue;
}
before = false;
// is blk2 before blk1? (for table entries)
if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
if (page->primaryLR) {
if (blk2->xMax <= blk1->xMin &&
blk2->yMin <= blk1->yMax &&
blk2->yMax >= blk1->yMin)
before = true;
} else {
if (blk2->xMin >= blk1->xMax &&
blk2->yMin <= blk1->yMax &&
blk2->yMax >= blk1->yMin)
before = true;
}
if (blk2->yMax <= blk1->yMin)
before = true;
} else {
if (blk2->isBeforeByRule1(blk1)) {
// Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
before = true;
#if 0 // for debugging
printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
#endif
} else if (blk2->isBeforeByRule2(blk1)) {
// Rule (2) blk2 left of blk1, and no intervening blk3
// such that blk1 is before blk3 by rule 1,
// and blk3 is before blk2 by rule 1.
before = true;
for (int i = 0; i < cacheSize && cache[i]; ++i) {
if (blk1->isBeforeByRule1(cache[i]) &&
cache[i]->isBeforeByRule1(blk2)) {
before = false;
std::rotate(cache, cache + i, cache + i + 1);
break;
}
}
if (before) {
for (blk3 = blkList; blk3; blk3 = blk3->next) {
if (blk3 == blk2 || blk3 == blk1) {
continue;
}
if (blk1->isBeforeByRule1(blk3) &&
blk3->isBeforeByRule1(blk2)) {
before = false;
std::copy_backward(cache, cache + cacheSize - 1,
cache + cacheSize);
cache[0] = blk3;
break;
}
}
}
#if 0 // for debugging
if (before) {
printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
}
#endif
}
}
if (before) {
// blk2 is before blk1, so it needs to be visited
// before we can add blk1 to the sorted list.
sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited, cache, cacheSize);
}
}
#if 0 // for debugging
printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
#endif
sorted[sortPos++] = blk1;
return sortPos;
}
int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1,
TextBlock **sorted, int sortPos,
bool* visited) {
const int blockCacheSize = 4;
TextBlock *blockCache[blockCacheSize];
std::fill(blockCache, blockCache + blockCacheSize, nullptr);
return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, blockCache,
blockCacheSize);
}
//------------------------------------------------------------------------
// TextFlow
//------------------------------------------------------------------------
TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
page = pageA;
xMin = blk->xMin;
xMax = blk->xMax;
yMin = blk->yMin;
yMax = blk->yMax;
priMin = blk->priMin;
priMax = blk->priMax;
blocks = lastBlk = blk;
next = nullptr;
}
TextFlow::~TextFlow() {
TextBlock *blk;
while (blocks) {
blk = blocks;
blocks = blocks->next;
delete blk;
}
}
void TextFlow::addBlock(TextBlock *blk) {
if (lastBlk) {
lastBlk->next = blk;
} else {
blocks = blk;
}
lastBlk = blk;
if (blk->xMin < xMin) {
xMin = blk->xMin;
}
if (blk->xMax > xMax) {
xMax = blk->xMax;
}
if (blk->yMin < yMin) {
yMin = blk->yMin;
}
if (blk->yMax > yMax) {
yMax = blk->yMax;
}
}
bool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
bool fits;
// lower blocks must use smaller fonts
if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
return false;
}
fits = false; // make gcc happy
switch (page->primaryRot) {
case 0:
fits = blk->xMin >= priMin && blk->xMax <= priMax;
break;
case 1:
fits = blk->yMin >= priMin && blk->yMax <= priMax;
break;
case 2:
fits = blk->xMin >= priMin && blk->xMax <= priMax;
break;
case 3:
fits = blk->yMin >= priMin && blk->yMax <= priMax;
break;
}
return fits;
}
#ifdef TEXTOUT_WORD_LIST
//------------------------------------------------------------------------
// TextWordList
//------------------------------------------------------------------------
TextWordList::TextWordList(TextPage *text, bool physLayout) {
TextFlow *flow;
TextBlock *blk;
TextLine *line;
TextWord *word;
TextWord **wordArray;
int nWords, i;
words = new std::vector<TextWord*>();
if (text->rawOrder) {
for (word = text->rawWords; word; word = word->next) {
words->push_back(word);
}
} else if (physLayout) {
// this is inefficient, but it's also the least useful of these
// three cases
nWords = 0;
for (flow = text->flows; flow; flow = flow->next) {
for (blk = flow->blocks; blk; blk = blk->next) {
for (line = blk->lines; line; line = line->next) {
for (word = line->words; word; word = word->next) {
++nWords;
}
}
}
}
wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
i = 0;
for (flow = text->flows; flow; flow = flow->next) {
for (blk = flow->blocks; blk; blk = blk->next) {
for (line = blk->lines; line; line = line->next) {
for (word = line->words; word; word = word->next) {
wordArray[i++] = word;
}
}
}
}
qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
for (i = 0; i < nWords; ++i) {
words->push_back(wordArray[i]);
}
gfree(wordArray);
} else {
for (flow = text->flows; flow; flow = flow->next) {
for (blk = flow->blocks; blk; blk = blk->next) {
for (line = blk->lines; line; line = line->next) {
for (word = line->words; word; word = word->next) {
words->push_back(word);
}
}
}
}
}
}
TextWordList::~TextWordList() {
delete words;
}
int TextWordList::getLength() {
return words->size();
}
TextWord *TextWordList::get(int idx) {
if (idx < 0 || idx >= (int)words->size()) {
return nullptr;
}
return (*words)[idx];
}
#endif // TEXTOUT_WORD_LIST
//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
TextPage::TextPage(bool rawOrderA, bool discardDiagA) {
int rot;
refCnt = 1;
rawOrder = rawOrderA;
discardDiag = discardDiagA;
curWord = nullptr;
charPos = 0;
curFont = nullptr;
curFontSize = 0;
nest = 0;
nTinyChars = 0;
lastCharOverlap = false;
if (!rawOrder) {
for (rot = 0; rot < 4; ++rot) {
pools[rot] = new TextPool();
}
}
flows = nullptr;
blocks = nullptr;
rawWords = nullptr;
rawLastWord = nullptr;
fonts = new std::vector<TextFontInfo*>();
lastFindXMin = lastFindYMin = 0;
haveLastFind = false;
underlines = new std::vector<TextUnderline*>();
links = new std::vector<TextLink*>();
mergeCombining = true;
diagonal = false;
}
TextPage::~TextPage() {
int rot;
clear();
if (!rawOrder) {
for (rot = 0; rot < 4; ++rot) {
delete pools[rot];
}
}
delete fonts;
for (auto entry : *underlines) {
delete entry;
}
delete underlines;
for (auto entry : *links) {
delete entry;
}
delete links;
}
void TextPage::incRefCnt() {
refCnt++;
}
void TextPage::decRefCnt() {
if (--refCnt == 0)
delete this;
}
void TextPage::startPage(GfxState *state) {
clear();
if (state) {
pageWidth = state->getPageWidth();
pageHeight = state->getPageHeight();
} else {
pageWidth = pageHeight = 0;
}
}
void TextPage::endPage() {
if (curWord) {
endWord();
}
}
void TextPage::clear() {
int rot;
TextFlow *flow;
TextWord *word;
if (curWord) {
delete curWord;
curWord = nullptr;
}
if (rawOrder) {
while (rawWords) {
word = rawWords;
rawWords = rawWords->next;
delete word;
}
} else {
for (rot = 0; rot < 4; ++rot) {
delete pools[rot];
}
while (flows) {
flow = flows;
flows = flows->next;
delete flow;
}
gfree(blocks);
}
for (auto entry : *fonts) {
delete entry;
}
delete fonts;
for (auto entry : *underlines) {
delete entry;
}
delete underlines;
for (auto entry : *links) {
delete entry;
}
delete links;
diagonal = false;
curWord = nullptr;
charPos = 0;
curFont = nullptr;
curFontSize = 0;
nest = 0;
nTinyChars = 0;
if (!rawOrder) {
for (rot = 0; rot < 4; ++rot) {
pools[rot] = new TextPool();
}
}
flows = nullptr;
blocks = nullptr;
rawWords = nullptr;
rawLastWord = nullptr;
fonts = new std::vector<TextFontInfo*>();
underlines = new std::vector<TextUnderline*>();
links = new std::vector<TextLink*>();
}
void TextPage::updateFont(GfxState *state) {
GfxFont *gfxFont;
const double *fm;
const char *name;
int code, mCode, letterCode, anyCode;
double w;
// get the font info object
curFont = nullptr;
for (std::size_t i = 0; i < fonts->size(); ++i) {
curFont = (*fonts)[i];
if (curFont->matches(state)) {
break;
}
curFont = nullptr;
}
if (!