| //======================================================================== |
| // |
| // TextOutputDev.cc |
| // |
| // Copyright 1997-2003 Glyph & Cog, LLC |
| // |
| //======================================================================== |
| |
| //======================================================================== |
| // |
| // Modified under the Poppler project - http://poppler.freedesktop.org |
| // |
| // All changes made under the Poppler project to this file are licensed |
| // under GPL version 2 or later |
| // |
| // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com> |
| // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru> |
| // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org> |
| // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk> |
| // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net> |
| // Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com> |
| // Copyright (C) 2008 Koji Otani <sho@bbr.jp> |
| // Copyright (C) 2008, 2010-2012, 2014-2019 Albert Astals Cid <aacid@kde.org> |
| // Copyright (C) 2008 Pino Toscano <pino@kde.org> |
| // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl> |
| // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au> |
| // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net> |
| // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com> |
| // Copyright (C) 2010 Marek Kasik <mkasik@redhat.com> |
| // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp> |
| // Copyright (C) 2011 Sam Liao <phyomh@gmail.com> |
| // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de> |
| // Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us> |
| // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de> |
| // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org> |
| // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de> |
| // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk> |
| // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org> |
| // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
| // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com> |
| // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de> |
| // Copyright (C) 2018, 2019 Nelson Benítez León <nbenitezl@gmail.com> |
| // Copyright (C) 2019 Christian Persch <chpe@src.gnome.org> |
| // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de> |
| // |
| // To see a description of the changes please see the Changelog file that |
| // came with your tarball or type make ChangeLog if you are building from git |
| // |
| //======================================================================== |
| |
| #include <config.h> |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stddef.h> |
| #include <cmath> |
| #include <float.h> |
| #include <ctype.h> |
| #include <algorithm> |
| #ifdef _WIN32 |
| #include <fcntl.h> // for O_BINARY |
| #include <io.h> // for setmode |
| #endif |
| #include "goo/gfile.h" |
| #include "goo/gmem.h" |
| #include "goo/GooString.h" |
| #include "poppler-config.h" |
| #include "Error.h" |
| #include "GlobalParams.h" |
| #include "UnicodeMap.h" |
| #include "UnicodeTypeTable.h" |
| #include "Link.h" |
| #include "TextOutputDev.h" |
| #include "Page.h" |
| #include "Annot.h" |
| #include "UTF.h" |
| |
| //------------------------------------------------------------------------ |
| // parameters |
| //------------------------------------------------------------------------ |
| |
| // Each bucket in a text pool includes baselines within a range of |
| // this many points. |
| #define textPoolStep 4 |
| |
| // Inter-character space width which will cause addChar to start a new |
| // word. |
| #define minWordBreakSpace 0.1 |
| |
| // Negative inter-character space width, i.e., overlap, which will |
| // cause addChar to start a new word. |
| #define minDupBreakOverlap 0.2 |
| |
| // Max distance between baselines of two lines within a block, as a |
| // fraction of the font size. |
| #define maxLineSpacingDelta 1.5 |
| |
| // Max difference in primary font sizes on two lines in the same |
| // block. Delta1 is used when examining new lines above and below the |
| // current block; delta2 is used when examining text that overlaps the |
| // current block; delta3 is used when examining text to the left and |
| // right of the current block. |
| #define maxBlockFontSizeDelta1 0.05 |
| #define maxBlockFontSizeDelta2 0.6 |
| #define maxBlockFontSizeDelta3 0.2 |
| |
| // Max difference in font sizes inside a word. |
| #define maxWordFontSizeDelta 0.05 |
| |
| // Maximum distance between baselines of two words on the same line, |
| // e.g., distance between subscript or superscript and the primary |
| // baseline, as a fraction of the font size. |
| #define maxIntraLineDelta 0.5 |
| |
| // Minimum inter-word spacing, as a fraction of the font size. (Only |
| // used for raw ordering.) |
| #define minWordSpacing 0.15 |
| |
| // Maximum inter-word spacing, as a fraction of the font size. |
| #define maxWordSpacing 1.5 |
| |
| // Maximum horizontal spacing which will allow a word to be pulled |
| // into a block. |
| #define minColSpacing1 0.3 |
| |
| // Minimum spacing between columns, as a fraction of the font size. |
| #define minColSpacing2 1.0 |
| |
| // Maximum vertical spacing between blocks within a flow, as a |
| // multiple of the font size. |
| #define maxBlockSpacing 2.5 |
| |
| // Minimum spacing between characters within a word, as a fraction of |
| // the font size. |
| #define minCharSpacing -0.5 |
| |
| // Maximum spacing between characters within a word, as a fraction of |
| // the font size, when there is no obvious extra-wide character |
| // spacing. |
| #define maxCharSpacing 0.03 |
| |
| // When extra-wide character spacing is detected, the inter-character |
| // space threshold is set to the minimum inter-character space |
| // multiplied by this constant. |
| #define maxWideCharSpacingMul 1.3 |
| |
| // Upper limit on spacing between characters in a word. |
| #define maxWideCharSpacing 0.4 |
| |
| // Max difference in primary,secondary coordinates (as a fraction of |
| // the font size) allowed for duplicated text (fake boldface, drop |
| // shadows) which is to be discarded. |
| #define dupMaxPriDelta 0.1 |
| #define dupMaxSecDelta 0.2 |
| |
| // Max width of underlines (in points). |
| #define maxUnderlineWidth 3 |
| |
| // Min distance between baseline and underline (in points). |
| //~ this should be font-size-dependent |
| #define minUnderlineGap -2 |
| |
| // Max distance between baseline and underline (in points). |
| //~ this should be font-size-dependent |
| #define maxUnderlineGap 4 |
| |
| // Max horizontal distance between edge of word and start of underline |
| // (in points). |
| //~ this should be font-size-dependent |
| #define underlineSlack 1 |
| |
| // Max distance between edge of text and edge of link border |
| #define hyperlinkSlack 2 |
| |
| // Max distance between characters when combining a base character and |
| // combining character |
| #define combMaxMidDelta 0.3 |
| #define combMaxBaseDelta 0.4 |
| |
| namespace { |
| |
| inline bool isAscii7 (Unicode uchar) { |
| return uchar < 128; |
| } |
| |
| } |
| |
| static int reorderText(Unicode *text, int len, UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode* u) { |
| char lre[8], rle[8], popdf[8], buf[8]; |
| int lreLen = 0, rleLen = 0, popdfLen = 0, n; |
| int nCols, i, j, k; |
| |
| nCols = 0; |
| |
| if (s) { |
| lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); |
| rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); |
| popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); |
| } |
| |
| if (primaryLR) { |
| i = 0; |
| while (i < len) { |
| // output a left-to-right section |
| for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; |
| for (k = i; k < j; ++k) { |
| if (s) { |
| n = uMap->mapUnicode(text[k], buf, sizeof(buf)); |
| s->append(buf, n); |
| } |
| if (u) u[nCols] = text[k]; |
| ++nCols; |
| } |
| i = j; |
| // output a right-to-left section |
| for (j = i; |
| j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); |
| ++j) ; |
| if (j > i) { |
| if (s) s->append(rle, rleLen); |
| for (k = j - 1; k >= i; --k) { |
| if (s) { |
| n = uMap->mapUnicode(text[k], buf, sizeof(buf)); |
| s->append(buf, n); |
| } |
| if (u) u[nCols] = text[k]; |
| ++nCols; |
| } |
| if (s) s->append(popdf, popdfLen); |
| i = j; |
| } |
| } |
| } else { |
| // Note: This code treats numeric characters (European and |
| // Arabic/Indic) as left-to-right, which isn't strictly correct |
| // (incurs extra LRE/POPDF pairs), but does produce correct |
| // visual formatting. |
| if (s) s->append(rle, rleLen); |
| i = len - 1; |
| while (i >= 0) { |
| // output a right-to-left section |
| for (j = i; |
| j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); |
| --j) ; |
| for (k = i; k > j; --k) { |
| if (s) { |
| n = uMap->mapUnicode(text[k], buf, sizeof(buf)); |
| s->append(buf, n); |
| } |
| if (u) u[nCols] = text[k]; |
| ++nCols; |
| } |
| i = j; |
| // output a left-to-right section |
| for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; |
| if (j < i) { |
| if (s) s->append(lre, lreLen); |
| for (k = j + 1; k <= i; ++k) { |
| if (s) { |
| n = uMap->mapUnicode(text[k], buf, sizeof(buf)); |
| s->append(buf, n); |
| } |
| if (u) u[nCols] = text[k]; |
| ++nCols; |
| } |
| if (s) s->append(popdf, popdfLen); |
| i = j; |
| } |
| } |
| if (s) s->append(popdf, popdfLen); |
| } |
| |
| return nCols; |
| } |
| |
| //------------------------------------------------------------------------ |
| // TextUnderline |
| //------------------------------------------------------------------------ |
| |
| class TextUnderline { |
| public: |
| |
| TextUnderline(double x0A, double y0A, double x1A, double y1A) |
| { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; } |
| ~TextUnderline() {} |
| |
| double x0, y0, x1, y1; |
| bool horiz; |
| }; |
| |
| //------------------------------------------------------------------------ |
| // TextLink |
| //------------------------------------------------------------------------ |
| |
| class TextLink { |
| public: |
| |
| TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA) |
| { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; } |
| ~TextLink() {} |
| |
| int xMin, yMin, xMax, yMax; |
| AnnotLink *link; |
| }; |
| |
| //------------------------------------------------------------------------ |
| // TextFontInfo |
| //------------------------------------------------------------------------ |
| |
| TextFontInfo::TextFontInfo(GfxState *state) { |
| gfxFont = state->getFont(); |
| if (gfxFont) |
| gfxFont->incRefCnt(); |
| #ifdef TEXTOUT_WORD_LIST |
| fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy() |
| : nullptr; |
| flags = gfxFont ? gfxFont->getFlags() : 0; |
| #endif |
| } |
| |
| TextFontInfo::~TextFontInfo() { |
| if (gfxFont) |
| gfxFont->decRefCnt(); |
| #ifdef TEXTOUT_WORD_LIST |
| if (fontName) { |
| delete fontName; |
| } |
| #endif |
| } |
| |
| bool TextFontInfo::matches(GfxState *state) const { |
| return state->getFont() == gfxFont; |
| } |
| |
| bool TextFontInfo::matches(const TextFontInfo *fontInfo) const { |
| return gfxFont == fontInfo->gfxFont; |
| } |
| |
| double TextFontInfo::getAscent() const { |
| return gfxFont ? gfxFont->getAscent() : 0.95; |
| } |
| |
| double TextFontInfo::getDescent() const { |
| return gfxFont ? gfxFont->getDescent() : -0.35; |
| } |
| |
| int TextFontInfo::getWMode() const { |
| return gfxFont ? gfxFont->getWMode() : 0; |
| } |
| |
| //------------------------------------------------------------------------ |
| // TextWord |
| //------------------------------------------------------------------------ |
| |
| TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA) { |
| rot = rotA; |
| fontSize = fontSizeA; |
| text = nullptr; |
| charcode = nullptr; |
| edge = nullptr; |
| charPos = nullptr; |
| font = nullptr; |
| textMat = nullptr; |
| len = size = 0; |
| spaceAfter = false; |
| next = nullptr; |
| |
| #ifdef TEXTOUT_WORD_LIST |
| GfxRGB rgb; |
| |
| if ((state->getRender() & 3) == 1) { |
| state->getStrokeRGB(&rgb); |
| } else { |
| state->getFillRGB(&rgb); |
| } |
| colorR = colToDbl(rgb.r); |
| colorG = colToDbl(rgb.g); |
| colorB = colToDbl(rgb.b); |
| #endif |
| |
| underlined = false; |
| link = nullptr; |
| } |
| |
| TextWord::~TextWord() { |
| gfree(text); |
| gfree(charcode); |
| gfree(edge); |
| gfree(charPos); |
| gfree(font); |
| gfree(textMat); |
| } |
| |
| void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y, |
| double dx, double dy, int charPosA, int charLen, |
| CharCode c, Unicode u, const Matrix &textMatA) { |
| ensureCapacity(len+1); |
| text[len] = u; |
| charcode[len] = c; |
| charPos[len] = charPosA; |
| charPos[len + 1] = charPosA + charLen; |
| font[len] = fontA; |
| textMat[len] = textMatA; |
| |
| if (len == 0) |
| setInitialBounds(fontA, x, y); |
| |
| if (wMode) { // vertical writing mode |
| // NB: the rotation value has been incremented by 1 (in |
| // TextPage::beginWord()) for vertical writing mode |
| switch (rot) { |
| case 0: |
| edge[len] = x - fontSize; |
| xMax = edge[len+1] = x; |
| break; |
| case 1: |
| edge[len] = y - fontSize; |
| yMax = edge[len+1] = y; |
| break; |
| case 2: |
| edge[len] = x + fontSize; |
| xMin = edge[len+1] = x; |
| break; |
| case 3: |
| edge[len] = y + fontSize; |
| yMin = edge[len+1] = y; |
| break; |
| } |
| } else { // horizontal writing mode |
| switch (rot) { |
| case 0: |
| edge[len] = x; |
| xMax = edge[len+1] = x + dx; |
| break; |
| case 1: |
| edge[len] = y; |
| yMax = edge[len+1] = y + dy; |
| break; |
| case 2: |
| edge[len] = x; |
| xMin = edge[len+1] = x + dx; |
| break; |
| case 3: |
| edge[len] = y; |
| yMin = edge[len+1] = y + dy; |
| break; |
| } |
| } |
| ++len; |
| } |
| |
| void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) { |
| double ascent = fontA->getAscent() * fontSize; |
| double descent = fontA->getDescent() * fontSize; |
| wMode = fontA->getWMode(); |
| |
| if (wMode) { // vertical writing mode |
| // NB: the rotation value has been incremented by 1 (in |
| // TextPage::beginWord()) for vertical writing mode |
| switch (rot) { |
| case 0: |
| xMin = x - fontSize; |
| yMin = y - fontSize; |
| yMax = y; |
| base = y; |
| break; |
| case 1: |
| xMin = x; |
| yMin = y - fontSize; |
| xMax = x + fontSize; |
| base = x; |
| break; |
| case 2: |
| yMin = y; |
| xMax = x + fontSize; |
| yMax = y + fontSize; |
| base = y; |
| break; |
| case 3: |
| xMin = x - fontSize; |
| xMax = x; |
| yMax = y + fontSize; |
| base = x; |
| break; |
| } |
| } else { // horizontal writing mode |
| switch (rot) { |
| case 0: |
| xMin = x; |
| yMin = y - ascent; |
| yMax = y - descent; |
| if (yMin == yMax) { |
| // this is a sanity check for a case that shouldn't happen -- but |
| // if it does happen, we want to avoid dividing by zero later |
| yMin = y; |
| yMax = y + 1; |
| } |
| base = y; |
| break; |
| case 1: |
| xMin = x + descent; |
| yMin = y; |
| xMax = x + ascent; |
| if (xMin == xMax) { |
| // this is a sanity check for a case that shouldn't happen -- but |
| // if it does happen, we want to avoid dividing by zero later |
| xMin = x; |
| xMax = x + 1; |
| } |
| base = x; |
| break; |
| case 2: |
| yMin = y + descent; |
| xMax = x; |
| yMax = y + ascent; |
| if (yMin == yMax) { |
| // this is a sanity check for a case that shouldn't happen -- but |
| // if it does happen, we want to avoid dividing by zero later |
| yMin = y; |
| yMax = y + 1; |
| } |
| base = y; |
| break; |
| case 3: |
| xMin = x - ascent; |
| xMax = x - descent; |
| yMax = y; |
| if (xMin == xMax) { |
| // this is a sanity check for a case that shouldn't happen -- but |
| // if it does happen, we want to avoid dividing by zero later |
| xMin = x; |
| xMax = x + 1; |
| } |
| base = x; |
| break; |
| } |
| } |
| } |
| |
| void TextWord::ensureCapacity(int capacity) { |
| if (capacity > size) { |
| size = std::max(size + 16, capacity); |
| text = (Unicode *)greallocn(text, size, sizeof(Unicode)); |
| charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode)); |
| edge = (double *)greallocn(edge, (size + 1), sizeof(double)); |
| charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); |
| font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *)); |
| textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix)); |
| } |
| } |
| |
| struct CombiningTable { |
| Unicode base; |
| Unicode comb; |
| }; |
| |
| static struct CombiningTable combiningTable[] = { |
| {0x0060, 0x0300}, // grave |
| {0x00a8, 0x0308}, // dieresis |
| {0x00af, 0x0304}, // macron |
| {0x00b4, 0x0301}, // acute |
| {0x00b8, 0x0327}, // cedilla |
| {0x02c6, 0x0302}, // circumflex |
| {0x02c7, 0x030c}, // caron |
| {0x02d8, 0x0306}, // breve |
| {0x02d9, 0x0307}, // dotaccent |
| {0x02da, 0x030a}, // ring |
| {0x02dc, 0x0303}, // tilde |
| {0x02dd, 0x030b} // hungarumlaut (double acute accent) |
| }; |
| |
| // returning combining versions of characters |
| static Unicode getCombiningChar(Unicode u) { |
| int len = sizeof(combiningTable) / sizeof(combiningTable[0]); |
| for (int i = 0; i < len; ++i) { |
| if (u == combiningTable[i].base) |
| return combiningTable[i].comb; |
| } |
| return 0; |
| } |
| |
| bool TextWord::addCombining(GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, |
| double dx, double dy, int charPosA, int charLen, |
| CharCode c, Unicode u, const Matrix &textMatA) { |
| if (len == 0 || wMode != 0 || fontA->getWMode() != 0) |
| return false; |
| |
| Unicode cCurrent = getCombiningChar(u); |
| Unicode cPrev = getCombiningChar(text[len-1]); |
| double edgeMid = (edge[len-1] + edge[len]) / 2; |
| double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta; |
| |
| if (cCurrent != 0 && unicodeTypeAlphaNum(text[len-1])) { |
| // Current is a combining character, previous is base character |
| maxScaledMidDelta = fabs(edge[len] - edge[len-1]) * combMaxMidDelta; |
| charMid = charBase = maxScaledBaseDelta = 0; |
| |
| // Test if characters overlap |
| if (rot == 0 || rot == 2) { |
| charMid = x + (dx / 2); |
| charBase = y; |
| maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta; |
| } else { |
| charMid = y + (dy / 2); |
| charBase = x; |
| maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta; |
| } |
| |
| if (fabs(charMid - edgeMid) >= maxScaledMidDelta || |
| fabs(charBase - base) >= maxScaledBaseDelta) |
| return false; |
| |
| // Add character, but don't adjust edge / bounding box because |
| // combining character's positioning could be odd. |
| ensureCapacity(len+1); |
| text[len] = cCurrent; |
| charcode[len] = c; |
| charPos[len] = charPosA; |
| charPos[len+1] = charPosA + charLen; |
| font[len] = fontA; |
| textMat[len] = textMatA; |
| edge[len+1] = edge[len]; |
| edge[len] = (edge[len+1] + edge[len-1]) / 2; |
| ++len; |
| return true; |
| } |
| |
| if (cPrev != 0 && unicodeTypeAlphaNum(u)) { |
| // Previous is a combining character, current is base character |
| maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta; |
| charMid = charBase = maxScaledMidDelta = 0; |
| |
| // Test if characters overlap |
| if (rot == 0 || rot == 2) { |
| charMid = x + (dx / 2); |
| charBase = y; |
| maxScaledMidDelta = fabs(dx * combMaxMidDelta); |
| } else { |
| charMid = y + (dy / 2); |
| charBase = x; |
| maxScaledMidDelta = fabs(dy * combMaxMidDelta); |
| } |
| |
| if (fabs(charMid - edgeMid) >= maxScaledMidDelta || |
| fabs(charBase - base) >= maxScaledBaseDelta) |
| return false; |
| |
| // move combining character to after base character |
| ensureCapacity(len+1); |
| fontSize = fontSizeA; |
| text[len] = cPrev; |
| charcode[len] = charcode[len-1]; |
| charPos[len] = charPosA; |
| charPos[len+1] = charPosA + charLen; |
| font[len] = font[len-1]; |
| textMat[len] = textMat[len-1]; |
| |
| text[len-1] = u; |
| charcode[len-1] = c; |
| font[len-1] = fontA; |
| textMat[len-1] = textMatA; |
| |
| if (len == 1) |
| setInitialBounds(fontA, x, y); |
| |
| // Updated edges / bounding box because we changed the base |
| // character. |
| if (wMode) { |
| switch (rot) { |
| case 0: |
| edge[len-1] = x - fontSize; |
| xMax = edge[len+1] = x; |
| break; |
| case 1: |
| edge[len-1] = y - fontSize; |
| yMax = edge[len+1] = y; |
| break; |
| case 2: |
| edge[len-1] = x + fontSize; |
| xMin = edge[len+1] = x; |
| break; |
| case 3: |
| edge[len-1] = y + fontSize; |
| yMin = edge[len+1] = y; |
| break; |
| } |
| } else { |
| switch (rot) { |
| case 0: |
| edge[len-1] = x; |
| xMax = edge[len+1] = x + dx; |
| break; |
| case 1: |
| edge[len-1] = y; |
| yMax = edge[len+1] = y + dy; |
| break; |
| case 2: |
| edge[len-1] = x; |
| xMin = edge[len+1] = x + dx; |
| break; |
| case 3: |
| edge[len-1] = y; |
| yMin = edge[len+1] = y + dy; |
| break; |
| } |
| } |
| |
| edge[len] = (edge[len+1] + edge[len-1]) / 2; |
| ++len; |
| return true; |
| } |
| return false; |
| } |
| |
| void TextWord::merge(TextWord *word) { |
| int i; |
| |
| if (word->xMin < xMin) { |
| xMin = word->xMin; |
| } |
| if (word->yMin < yMin) { |
| yMin = word->yMin; |
| } |
| if (word->xMax > xMax) { |
| xMax = word->xMax; |
| } |
| if (word->yMax > yMax) { |
| yMax = word->yMax; |
| } |
| ensureCapacity(len + word->len); |
| for (i = 0; i < word->len; ++i) { |
| text[len + i] = word->text[i]; |
| charcode[len + i] = word->charcode[i]; |
| edge[len + i] = word->edge[i]; |
| charPos[len + i] = word->charPos[i]; |
| font[len + i] = word->font[i]; |
| textMat[len + i] = word->textMat[i]; |
| } |
| edge[len + word->len] = word->edge[word->len]; |
| charPos[len + word->len] = word->charPos[word->len]; |
| len += word->len; |
| } |
| |
| inline int TextWord::primaryCmp(TextWord *word) { |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| cmp = xMin - word->xMin; |
| break; |
| case 1: |
| cmp = yMin - word->yMin; |
| break; |
| case 2: |
| cmp = word->xMax - xMax; |
| break; |
| case 3: |
| cmp = word->yMax - yMax; |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| double TextWord::primaryDelta(TextWord *word) { |
| double delta; |
| |
| delta = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| delta = word->xMin - xMax; |
| break; |
| case 1: |
| delta = word->yMin - yMax; |
| break; |
| case 2: |
| delta = xMin - word->xMax; |
| break; |
| case 3: |
| delta = yMin - word->yMax; |
| break; |
| } |
| return delta; |
| } |
| |
| int TextWord::cmpYX(const void *p1, const void *p2) { |
| TextWord *word1 = *(TextWord **)p1; |
| TextWord *word2 = *(TextWord **)p2; |
| double cmp; |
| |
| cmp = word1->yMin - word2->yMin; |
| if (cmp == 0) { |
| cmp = word1->xMin - word2->xMin; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| #ifdef TEXTOUT_WORD_LIST |
| |
| GooString *TextWord::getText() { |
| GooString *s; |
| UnicodeMap *uMap; |
| char buf[8]; |
| int n, i; |
| |
| s = new GooString(); |
| if (!(uMap = globalParams->getTextEncoding())) { |
| return s; |
| } |
| for (i = 0; i < len; ++i) { |
| n = uMap->mapUnicode(text[i], buf, sizeof(buf)); |
| s->append(buf, n); |
| } |
| uMap->decRefCnt(); |
| return s; |
| } |
| |
| void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, |
| double *xMaxA, double *yMaxA) { |
| if (charIdx < 0 || charIdx >= len) { |
| return; |
| } |
| switch (rot) { |
| case 0: |
| *xMinA = edge[charIdx]; |
| *xMaxA = edge[charIdx + 1]; |
| *yMinA = yMin; |
| *yMaxA = yMax; |
| break; |
| case 1: |
| *xMinA = xMin; |
| *xMaxA = xMax; |
| *yMinA = edge[charIdx]; |
| *yMaxA = edge[charIdx + 1]; |
| break; |
| case 2: |
| *xMinA = edge[charIdx + 1]; |
| *xMaxA = edge[charIdx]; |
| *yMinA = yMin; |
| *yMaxA = yMax; |
| break; |
| case 3: |
| *xMinA = xMin; |
| *xMaxA = xMax; |
| *yMinA = edge[charIdx + 1]; |
| *yMaxA = edge[charIdx]; |
| break; |
| } |
| } |
| |
| #endif // TEXTOUT_WORD_LIST |
| |
| //------------------------------------------------------------------------ |
| // TextPool |
| //------------------------------------------------------------------------ |
| |
| TextPool::TextPool() { |
| minBaseIdx = 0; |
| maxBaseIdx = -1; |
| pool = nullptr; |
| cursor = nullptr; |
| cursorBaseIdx = -1; |
| } |
| |
| TextPool::~TextPool() { |
| int baseIdx; |
| TextWord *word, *word2; |
| |
| for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { |
| for (word = pool[baseIdx - minBaseIdx]; word; word = word2) { |
| word2 = word->next; |
| delete word; |
| } |
| } |
| gfree(pool); |
| } |
| |
| int TextPool::getBaseIdx(double base) { |
| const double baseIdxDouble = base / textPoolStep; |
| if (baseIdxDouble < minBaseIdx) { |
| return minBaseIdx; |
| } |
| if (baseIdxDouble > maxBaseIdx) { |
| return maxBaseIdx; |
| } |
| return (int)baseIdxDouble; |
| } |
| |
| void TextPool::addWord(TextWord *word) { |
| int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx; |
| TextWord *w0, *w1; |
| |
| // expand the array if needed |
| wordBaseIdx = (int)(word->base / textPoolStep); |
| if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) { |
| error(errSyntaxWarning, -1, "wordBaseIdx out of range"); |
| delete word; |
| return; |
| } |
| if (minBaseIdx > maxBaseIdx) { |
| minBaseIdx = wordBaseIdx - 128; |
| maxBaseIdx = wordBaseIdx + 128; |
| pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1, |
| sizeof(TextWord *)); |
| for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { |
| pool[baseIdx - minBaseIdx] = nullptr; |
| } |
| } else if (wordBaseIdx < minBaseIdx) { |
| newMinBaseIdx = wordBaseIdx - 128; |
| TextWord **newPool = (TextWord **)gmallocn_checkoverflow(maxBaseIdx - newMinBaseIdx + 1, |
| sizeof(TextWord *)); |
| if (unlikely(!newPool)) { |
| error(errSyntaxWarning, -1, "newPool would overflow"); |
| delete word; |
| return; |
| } |
| for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) { |
| newPool[baseIdx - newMinBaseIdx] = nullptr; |
| } |
| memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, |
| (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *)); |
| gfree(pool); |
| pool = newPool; |
| minBaseIdx = newMinBaseIdx; |
| } else if (wordBaseIdx > maxBaseIdx) { |
| newMaxBaseIdx = wordBaseIdx + 128; |
| TextWord **reallocatedPool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1, |
| sizeof(TextWord *), true /*checkoverflow*/, false /*free_pool*/); |
| if (!reallocatedPool) { |
| error(errSyntaxWarning, -1, "new pool size would overflow"); |
| delete word; |
| return; |
| } |
| pool = reallocatedPool; |
| for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) { |
| pool[baseIdx - minBaseIdx] = nullptr; |
| } |
| maxBaseIdx = newMaxBaseIdx; |
| } |
| |
| // insert the new word |
| if (cursor && wordBaseIdx == cursorBaseIdx && |
| word->primaryCmp(cursor) >= 0) { |
| w0 = cursor; |
| w1 = cursor->next; |
| } else { |
| w0 = nullptr; |
| w1 = pool[wordBaseIdx - minBaseIdx]; |
| } |
| for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ; |
| word->next = w1; |
| if (w0) { |
| w0->next = word; |
| } else { |
| pool[wordBaseIdx - minBaseIdx] = word; |
| } |
| cursor = word; |
| cursorBaseIdx = wordBaseIdx; |
| } |
| |
| //------------------------------------------------------------------------ |
| // TextLine |
| //------------------------------------------------------------------------ |
| |
| TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) { |
| blk = blkA; |
| rot = rotA; |
| base = baseA; |
| words = lastWord = nullptr; |
| text = nullptr; |
| edge = nullptr; |
| col = nullptr; |
| len = 0; |
| convertedLen = 0; |
| hyphenated = false; |
| next = nullptr; |
| xMin = yMin = 0; |
| xMax = yMax = -1; |
| normalized = nullptr; |
| normalized_len = 0; |
| normalized_idx = nullptr; |
| ascii_translation = nullptr; |
| ascii_len = 0; |
| ascii_idx = nullptr; |
| } |
| |
| TextLine::~TextLine() { |
| TextWord *word; |
| |
| while (words) { |
| word = words; |
| words = words->next; |
| delete word; |
| } |
| gfree(text); |
| gfree(edge); |
| gfree(col); |
| if (normalized) { |
| gfree(normalized); |
| gfree(normalized_idx); |
| } |
| if (ascii_translation) { |
| gfree(ascii_translation); |
| gfree(ascii_idx); |
| } |
| } |
| |
| void TextLine::addWord(TextWord *word) { |
| if (lastWord) { |
| lastWord->next = word; |
| } else { |
| words = word; |
| } |
| lastWord = word; |
| |
| if (xMin > xMax) { |
| xMin = word->xMin; |
| xMax = word->xMax; |
| yMin = word->yMin; |
| yMax = word->yMax; |
| } else { |
| if (word->xMin < xMin) { |
| xMin = word->xMin; |
| } |
| if (word->xMax > xMax) { |
| xMax = word->xMax; |
| } |
| if (word->yMin < yMin) { |
| yMin = word->yMin; |
| } |
| if (word->yMax > yMax) { |
| yMax = word->yMax; |
| } |
| } |
| } |
| |
| double TextLine::primaryDelta(TextLine *line) { |
| double delta; |
| |
| delta = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| delta = line->xMin - xMax; |
| break; |
| case 1: |
| delta = line->yMin - yMax; |
| break; |
| case 2: |
| delta = xMin - line->xMax; |
| break; |
| case 3: |
| delta = yMin - line->yMax; |
| break; |
| } |
| return delta; |
| } |
| |
| int TextLine::primaryCmp(TextLine *line) { |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| cmp = xMin - line->xMin; |
| break; |
| case 1: |
| cmp = yMin - line->yMin; |
| break; |
| case 2: |
| cmp = line->xMax - xMax; |
| break; |
| case 3: |
| cmp = line->yMax - yMax; |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextLine::secondaryCmp(TextLine *line) { |
| double cmp; |
| |
| cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base; |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextLine::cmpYX(TextLine *line) { |
| int cmp; |
| |
| if ((cmp = secondaryCmp(line))) { |
| return cmp; |
| } |
| return primaryCmp(line); |
| } |
| |
| int TextLine::cmpXY(const void *p1, const void *p2) { |
| TextLine *line1 = *(TextLine **)p1; |
| TextLine *line2 = *(TextLine **)p2; |
| int cmp; |
| |
| if ((cmp = line1->primaryCmp(line2))) { |
| return cmp; |
| } |
| return line1->secondaryCmp(line2); |
| } |
| |
| void TextLine::coalesce(UnicodeMap *uMap) { |
| TextWord *word0, *word1; |
| double space, delta, minSpace; |
| bool isUnicode; |
| char buf[8]; |
| int i, j; |
| |
| if (words->next) { |
| |
| // compute the inter-word space threshold |
| if (words->len > 1 || words->next->len > 1) { |
| minSpace = 0; |
| } else { |
| minSpace = words->primaryDelta(words->next); |
| for (word0 = words->next, word1 = word0->next; |
| word1 && minSpace > 0; |
| word0 = word1, word1 = word0->next) { |
| if (word1->len > 1) { |
| minSpace = 0; |
| } |
| delta = word0->primaryDelta(word1); |
| if (delta < minSpace) { |
| minSpace = delta; |
| } |
| } |
| } |
| if (minSpace <= 0) { |
| space = maxCharSpacing * words->fontSize; |
| } else { |
| space = maxWideCharSpacingMul * minSpace; |
| if (space > maxWideCharSpacing * words->fontSize) { |
| space = maxWideCharSpacing * words->fontSize; |
| } |
| } |
| |
| // merge words |
| word0 = words; |
| word1 = words->next; |
| while (word1) { |
| if (word0->primaryDelta(word1) >= space) { |
| word0->spaceAfter = true; |
| word0 = word1; |
| word1 = word1->next; |
| } else if (word0->font[word0->len - 1] == word1->font[0] && |
| word0->underlined == word1->underlined && |
| fabs(word0->fontSize - word1->fontSize) < |
| maxWordFontSizeDelta * words->fontSize && |
| word1->charPos[0] == word0->charPos[word0->len]) { |
| word0->merge(word1); |
| word0->next = word1->next; |
| delete word1; |
| word1 = word0->next; |
| } else { |
| word0 = word1; |
| word1 = word1->next; |
| } |
| } |
| } |
| |
| // build the line text |
| isUnicode = uMap ? uMap->isUnicode() : false; |
| len = 0; |
| for (word1 = words; word1; word1 = word1->next) { |
| len += word1->len; |
| if (word1->spaceAfter) { |
| ++len; |
| } |
| } |
| text = (Unicode *)gmallocn(len, sizeof(Unicode)); |
| edge = (double *)gmallocn(len + 1, sizeof(double)); |
| i = 0; |
| for (word1 = words; word1; word1 = word1->next) { |
| for (j = 0; j < word1->len; ++j) { |
| text[i] = word1->text[j]; |
| edge[i] = word1->edge[j]; |
| ++i; |
| } |
| edge[i] = word1->edge[word1->len]; |
| if (word1->spaceAfter) { |
| text[i] = (Unicode)0x0020; |
| ++i; |
| } |
| } |
| |
| // compute convertedLen and set up the col array |
| col = (int *)gmallocn(len + 1, sizeof(int)); |
| convertedLen = 0; |
| for (i = 0; i < len; ++i) { |
| col[i] = convertedLen; |
| if (isUnicode) { |
| ++convertedLen; |
| } else if (uMap) { |
| convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); |
| } |
| } |
| col[len] = convertedLen; |
| |
| // check for hyphen at end of line |
| //~ need to check for other chars used as hyphens |
| hyphenated = text[len - 1] == (Unicode)'-'; |
| } |
| |
| //------------------------------------------------------------------------ |
| // TextLineFrag |
| //------------------------------------------------------------------------ |
| |
| class TextLineFrag { |
| public: |
| |
| TextLine *line; // the line object |
| int start, len; // offset and length of this fragment |
| // (in Unicode chars) |
| double xMin, xMax; // bounding box coordinates |
| double yMin, yMax; |
| double base; // baseline virtual coordinate |
| int col; // first column |
| |
| void init(TextLine *lineA, int startA, int lenA); |
| void computeCoords(bool oneRot); |
| |
| static int cmpYXPrimaryRot(const void *p1, const void *p2); |
| static int cmpYXLineRot(const void *p1, const void *p2); |
| static int cmpXYLineRot(const void *p1, const void *p2); |
| static int cmpXYColumnPrimaryRot(const void *p1, const void *p2); |
| static int cmpXYColumnLineRot(const void *p1, const void *p2); |
| }; |
| |
| void TextLineFrag::init(TextLine *lineA, int startA, int lenA) { |
| line = lineA; |
| start = startA; |
| len = lenA; |
| col = line->col[start]; |
| } |
| |
| void TextLineFrag::computeCoords(bool oneRot) { |
| TextBlock *blk; |
| double d0, d1, d2, d3, d4; |
| |
| if (oneRot) { |
| |
| switch (line->rot) { |
| case 0: |
| xMin = line->edge[start]; |
| xMax = line->edge[start + len]; |
| yMin = line->yMin; |
| yMax = line->yMax; |
| break; |
| case 1: |
| xMin = line->xMin; |
| xMax = line->xMax; |
| yMin = line->edge[start]; |
| yMax = line->edge[start + len]; |
| break; |
| case 2: |
| xMin = line->edge[start + len]; |
| xMax = line->edge[start]; |
| yMin = line->yMin; |
| yMax = line->yMax; |
| break; |
| case 3: |
| xMin = line->xMin; |
| xMax = line->xMax; |
| yMin = line->edge[start + len]; |
| yMax = line->edge[start]; |
| break; |
| } |
| base = line->base; |
| |
| } else { |
| |
| if (line->rot == 0 && line->blk->page->primaryRot == 0) { |
| |
| xMin = line->edge[start]; |
| xMax = line->edge[start + len]; |
| yMin = line->yMin; |
| yMax = line->yMax; |
| base = line->base; |
| |
| } else { |
| |
| blk = line->blk; |
| d0 = line->edge[start]; |
| d1 = line->edge[start + len]; |
| d2 = d3 = d4 = 0; // make gcc happy |
| |
| switch (line->rot) { |
| case 0: |
| d2 = line->yMin; |
| d3 = line->yMax; |
| d4 = line->base; |
| d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin); |
| d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin); |
| d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin); |
| d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin); |
| d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin); |
| break; |
| case 1: |
| d2 = line->xMax; |
| d3 = line->xMin; |
| d4 = line->base; |
| d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin); |
| d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin); |
| d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin); |
| d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin); |
| d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin); |
| break; |
| case 2: |
| d2 = line->yMax; |
| d3 = line->yMin; |
| d4 = line->base; |
| d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin); |
| d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin); |
| d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin); |
| d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin); |
| d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin); |
| break; |
| case 3: |
| d2 = line->xMin; |
| d3 = line->xMax; |
| d4 = line->base; |
| d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin); |
| d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin); |
| d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin); |
| d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin); |
| d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin); |
| break; |
| } |
| |
| switch (line->blk->page->primaryRot) { |
| case 0: |
| xMin = blk->xMin + d0 * (blk->xMax - blk->xMin); |
| xMax = blk->xMin + d1 * (blk->xMax - blk->xMin); |
| yMin = blk->yMin + d2 * (blk->yMax - blk->yMin); |
| yMax = blk->yMin + d3 * (blk->yMax - blk->yMin); |
| base = blk->yMin + d4 * (blk->yMax - blk->yMin); |
| break; |
| case 1: |
| xMin = blk->xMax - d3 * (blk->xMax - blk->xMin); |
| xMax = blk->xMax - d2 * (blk->xMax - blk->xMin); |
| yMin = blk->yMin + d0 * (blk->yMax - blk->yMin); |
| yMax = blk->yMin + d1 * (blk->yMax - blk->yMin); |
| base = blk->xMax - d4 * (blk->xMax - blk->xMin); |
| break; |
| case 2: |
| xMin = blk->xMax - d1 * (blk->xMax - blk->xMin); |
| xMax = blk->xMax - d0 * (blk->xMax - blk->xMin); |
| yMin = blk->yMax - d3 * (blk->yMax - blk->yMin); |
| yMax = blk->yMax - d2 * (blk->yMax - blk->yMin); |
| base = blk->yMax - d4 * (blk->yMax - blk->yMin); |
| break; |
| case 3: |
| xMin = blk->xMin + d2 * (blk->xMax - blk->xMin); |
| xMax = blk->xMin + d3 * (blk->xMax - blk->xMin); |
| yMin = blk->yMax - d1 * (blk->yMax - blk->yMin); |
| yMax = blk->yMax - d0 * (blk->yMax - blk->yMin); |
| base = blk->xMin + d4 * (blk->xMax - blk->xMin); |
| break; |
| } |
| |
| } |
| } |
| } |
| |
| int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) { |
| TextLineFrag *frag1 = (TextLineFrag *)p1; |
| TextLineFrag *frag2 = (TextLineFrag *)p2; |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (frag1->line->blk->page->primaryRot) { |
| case 0: |
| if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) { |
| cmp = frag1->xMin - frag2->xMin; |
| } |
| break; |
| case 1: |
| if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) { |
| cmp = frag1->yMin - frag2->yMin; |
| } |
| break; |
| case 2: |
| if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) { |
| cmp = frag2->xMax - frag1->xMax; |
| } |
| break; |
| case 3: |
| if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) { |
| cmp = frag2->yMax - frag1->yMax; |
| } |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) { |
| TextLineFrag *frag1 = (TextLineFrag *)p1; |
| TextLineFrag *frag2 = (TextLineFrag *)p2; |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (frag1->line->rot) { |
| case 0: |
| if ((cmp = frag1->yMin - frag2->yMin) == 0) { |
| cmp = frag1->xMin - frag2->xMin; |
| } |
| break; |
| case 1: |
| if ((cmp = frag2->xMax - frag1->xMax) == 0) { |
| cmp = frag1->yMin - frag2->yMin; |
| } |
| break; |
| case 2: |
| if ((cmp = frag2->yMin - frag1->yMin) == 0) { |
| cmp = frag2->xMax - frag1->xMax; |
| } |
| break; |
| case 3: |
| if ((cmp = frag1->xMax - frag2->xMax) == 0) { |
| cmp = frag2->yMax - frag1->yMax; |
| } |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) { |
| TextLineFrag *frag1 = (TextLineFrag *)p1; |
| TextLineFrag *frag2 = (TextLineFrag *)p2; |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (frag1->line->rot) { |
| case 0: |
| if ((cmp = frag1->xMin - frag2->xMin) == 0) { |
| cmp = frag1->yMin - frag2->yMin; |
| } |
| break; |
| case 1: |
| if ((cmp = frag1->yMin - frag2->yMin) == 0) { |
| cmp = frag2->xMax - frag1->xMax; |
| } |
| break; |
| case 2: |
| if ((cmp = frag2->xMax - frag1->xMax) == 0) { |
| cmp = frag2->yMin - frag1->yMin; |
| } |
| break; |
| case 3: |
| if ((cmp = frag2->yMax - frag1->yMax) == 0) { |
| cmp = frag1->xMax - frag2->xMax; |
| } |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) { |
| TextLineFrag *frag1 = (TextLineFrag *)p1; |
| TextLineFrag *frag2 = (TextLineFrag *)p2; |
| double cmp; |
| |
| // if columns overlap, compare y values |
| if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - |
| frag2->line->col[frag2->start]) && |
| frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - |
| frag1->line->col[frag1->start])) { |
| cmp = 0; // make gcc happy |
| switch (frag1->line->blk->page->primaryRot) { |
| case 0: cmp = frag1->yMin - frag2->yMin; break; |
| case 1: cmp = frag2->xMax - frag1->xMax; break; |
| case 2: cmp = frag2->yMin - frag1->yMin; break; |
| case 3: cmp = frag1->xMax - frag2->xMax; break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| // otherwise, compare starting column |
| return frag1->col - frag2->col; |
| } |
| |
| int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) { |
| TextLineFrag *frag1 = (TextLineFrag *)p1; |
| TextLineFrag *frag2 = (TextLineFrag *)p2; |
| double cmp; |
| |
| // if columns overlap, compare y values |
| if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - |
| frag2->line->col[frag2->start]) && |
| frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - |
| frag1->line->col[frag1->start])) { |
| cmp = 0; // make gcc happy |
| switch (frag1->line->rot) { |
| case 0: cmp = frag1->yMin - frag2->yMin; break; |
| case 1: cmp = frag2->xMax - frag1->xMax; break; |
| case 2: cmp = frag2->yMin - frag1->yMin; break; |
| case 3: cmp = frag1->xMax - frag2->xMax; break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| // otherwise, compare starting column |
| return frag1->col - frag2->col; |
| } |
| |
| //------------------------------------------------------------------------ |
| // TextBlock |
| //------------------------------------------------------------------------ |
| |
| TextBlock::TextBlock(TextPage *pageA, int rotA) { |
| page = pageA; |
| rot = rotA; |
| xMin = yMin = 0; |
| xMax = yMax = -1; |
| priMin = 0; |
| priMax = page->pageWidth; |
| pool = new TextPool(); |
| lines = nullptr; |
| curLine = nullptr; |
| next = nullptr; |
| stackNext = nullptr; |
| tableId = -1; |
| tableEnd = false; |
| } |
| |
| TextBlock::~TextBlock() { |
| TextLine *line; |
| |
| delete pool; |
| while (lines) { |
| line = lines; |
| lines = lines->next; |
| delete line; |
| } |
| } |
| |
| void TextBlock::addWord(TextWord *word) { |
| pool->addWord(word); |
| if (xMin > xMax) { |
| xMin = word->xMin; |
| xMax = word->xMax; |
| yMin = word->yMin; |
| yMax = word->yMax; |
| } else { |
| if (word->xMin < xMin) { |
| xMin = word->xMin; |
| } |
| if (word->xMax > xMax) { |
| xMax = word->xMax; |
| } |
| if (word->yMin < yMin) { |
| yMin = word->yMin; |
| } |
| if (word->yMax > yMax) { |
| yMax = word->yMax; |
| } |
| } |
| } |
| |
| void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { |
| TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord; |
| TextLine *line, *line0, *line1; |
| int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx; |
| int baseIdx, bestWordBaseIdx, idx0, idx1; |
| double minBase, maxBase; |
| double fontSize, wordSpacing, delta, priDelta, secDelta; |
| TextLine **lineArray; |
| bool found, overlap; |
| int col1, col2; |
| int i, j, k; |
| |
| // discard duplicated text (fake boldface, drop shadows) |
| for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) { |
| word0 = pool->getPool(idx0); |
| while (word0) { |
| priDelta = dupMaxPriDelta * word0->fontSize; |
| secDelta = dupMaxSecDelta * word0->fontSize; |
| maxBaseIdx = pool->getBaseIdx(word0->base + secDelta); |
| found = false; |
| word1 = word2 = nullptr; // make gcc happy |
| for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) { |
| if (idx1 == idx0) { |
| word1 = word0; |
| word2 = word0->next; |
| } else { |
| word1 = nullptr; |
| word2 = pool->getPool(idx1); |
| } |
| for (; word2; word1 = word2, word2 = word2->next) { |
| if (word2->len == word0->len && |
| !memcmp(word2->text, word0->text, |
| word0->len * sizeof(Unicode))) { |
| switch (rot) { |
| case 0: |
| case 2: |
| found = fabs(word0->xMin - word2->xMin) < priDelta && |
| fabs(word0->xMax - word2->xMax) < priDelta && |
| fabs(word0->yMin - word2->yMin) < secDelta && |
| fabs(word0->yMax - word2->yMax) < secDelta; |
| break; |
| case 1: |
| case 3: |
| found = fabs(word0->xMin - word2->xMin) < secDelta && |
| fabs(word0->xMax - word2->xMax) < secDelta && |
| fabs(word0->yMin - word2->yMin) < priDelta && |
| fabs(word0->yMax - word2->yMax) < priDelta; |
| break; |
| } |
| } |
| if (found) { |
| break; |
| } |
| } |
| if (found) { |
| break; |
| } |
| } |
| if (found) { |
| if (word1) { |
| word1->next = word2->next; |
| } else { |
| pool->setPool(idx1, word2->next); |
| } |
| delete word2; |
| } else { |
| word0 = word0->next; |
| } |
| } |
| } |
| |
| // build the lines |
| curLine = nullptr; |
| poolMinBaseIdx = pool->minBaseIdx; |
| charCount = 0; |
| nLines = 0; |
| while (1) { |
| |
| // find the first non-empty line in the pool |
| for (; |
| poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); |
| ++poolMinBaseIdx) ; |
| if (poolMinBaseIdx > pool->maxBaseIdx) { |
| break; |
| } |
| |
| // look for the left-most word in the first four lines of the |
| // pool -- this avoids starting with a superscript word |
| startBaseIdx = poolMinBaseIdx; |
| for (baseIdx = poolMinBaseIdx + 1; |
| baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; |
| ++baseIdx) { |
| if (!pool->getPool(baseIdx)) { |
| continue; |
| } |
| if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) |
| < 0) { |
| startBaseIdx = baseIdx; |
| } |
| } |
| |
| // create a new line |
| word0 = pool->getPool(startBaseIdx); |
| pool->setPool(startBaseIdx, word0->next); |
| word0->next = nullptr; |
| line = new TextLine(this, word0->rot, word0->base); |
| line->addWord(word0); |
| lastWord = word0; |
| |
| // compute the search range |
| fontSize = word0->fontSize; |
| minBase = word0->base - maxIntraLineDelta * fontSize; |
| maxBase = word0->base + maxIntraLineDelta * fontSize; |
| minBaseIdx = pool->getBaseIdx(minBase); |
| maxBaseIdx = pool->getBaseIdx(maxBase); |
| wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize; |
| |
| // find the rest of the words in this line |
| while (1) { |
| |
| // find the left-most word whose baseline is in the range for |
| // this line |
| bestWordBaseIdx = 0; |
| bestWord0 = bestWord1 = nullptr; |
| overlap = false; |
| for (baseIdx = minBaseIdx; |
| !overlap && baseIdx <= maxBaseIdx; |
| ++baseIdx) { |
| for (word0 = nullptr, word1 = pool->getPool(baseIdx); |
| word1; |
| word0 = word1, word1 = word1->next) { |
| if (word1->base >= minBase && |
| word1->base <= maxBase) { |
| delta = lastWord->primaryDelta(word1); |
| if (delta < minCharSpacing * fontSize) { |
| overlap = true; |
| break; |
| } else { |
| if (delta < wordSpacing && |
| (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) { |
| bestWordBaseIdx = baseIdx; |
| bestWord0 = word0; |
| bestWord1 = word1; |
| } |
| break; |
| } |
| } |
| } |
| } |
| if (overlap || !bestWord1) { |
| break; |
| } |
| |
| // remove it from the pool, and add it to the line |
| if (bestWord0) { |
| bestWord0->next = bestWord1->next; |
| } else { |
| pool->setPool(bestWordBaseIdx, bestWord1->next); |
| } |
| bestWord1->next = nullptr; |
| line->addWord(bestWord1); |
| lastWord = bestWord1; |
| } |
| |
| // add the line |
| if (curLine && line->cmpYX(curLine) > 0) { |
| line0 = curLine; |
| line1 = curLine->next; |
| } else { |
| line0 = nullptr; |
| line1 = lines; |
| } |
| for (; |
| line1 && line->cmpYX(line1) > 0; |
| line0 = line1, line1 = line1->next) ; |
| if (line0) { |
| line0->next = line; |
| } else { |
| lines = line; |
| } |
| line->next = line1; |
| curLine = line; |
| line->coalesce(uMap); |
| charCount += line->len; |
| ++nLines; |
| } |
| |
| // sort lines into xy order for column assignment |
| lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *)); |
| for (line = lines, i = 0; line; line = line->next, ++i) { |
| lineArray[i] = line; |
| } |
| qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY); |
| |
| // column assignment |
| nColumns = 0; |
| if (fixedPitch) { |
| for (i = 0; i < nLines; ++i) { |
| line0 = lineArray[i]; |
| col1 = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5); |
| break; |
| case 1: |
| col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5); |
| break; |
| case 2: |
| col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5); |
| break; |
| case 3: |
| col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5); |
| break; |
| } |
| for (k = 0; k <= line0->len; ++k) { |
| line0->col[k] += col1; |
| } |
| if (line0->col[line0->len] > nColumns) { |
| nColumns = line0->col[line0->len]; |
| } |
| } |
| } else { |
| for (i = 0; i < nLines; ++i) { |
| line0 = lineArray[i]; |
| col1 = 0; |
| for (j = 0; j < i; ++j) { |
| line1 = lineArray[j]; |
| if (line1->primaryDelta(line0) >= 0) { |
| col2 = line1->col[line1->len] + 1; |
| } else { |
| k = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| for (k = 0; |
| k < line1->len && |
| line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); |
| ++k) ; |
| break; |
| case 1: |
| for (k = 0; |
| k < line1->len && |
| line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); |
| ++k) ; |
| break; |
| case 2: |
| for (k = 0; |
| k < line1->len && |
| line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); |
| ++k) ; |
| break; |
| case 3: |
| for (k = 0; |
| k < line1->len && |
| line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); |
| ++k) ; |
| break; |
| } |
| col2 = line1->col[k]; |
| } |
| if (col2 > col1) { |
| col1 = col2; |
| } |
| } |
| for (k = 0; k <= line0->len; ++k) { |
| line0->col[k] += col1; |
| } |
| if (line0->col[line0->len] > nColumns) { |
| nColumns = line0->col[line0->len]; |
| } |
| } |
| } |
| gfree(lineArray); |
| } |
| |
| void TextBlock::updatePriMinMax(TextBlock *blk) { |
| double newPriMin, newPriMax; |
| bool gotPriMin, gotPriMax; |
| |
| gotPriMin = gotPriMax = false; |
| newPriMin = newPriMax = 0; // make gcc happy |
| switch (page->primaryRot) { |
| case 0: |
| case 2: |
| if (blk->yMin < yMax && blk->yMax > yMin) { |
| if (blk->xMin < xMin) { |
| newPriMin = blk->xMax; |
| gotPriMin = true; |
| } |
| if (blk->xMax > xMax) { |
| newPriMax = blk->xMin; |
| gotPriMax = true; |
| } |
| } |
| break; |
| case 1: |
| case 3: |
| if (blk->xMin < xMax && blk->xMax > xMin) { |
| if (blk->yMin < yMin) { |
| newPriMin = blk->yMax; |
| gotPriMin = true; |
| } |
| if (blk->yMax > yMax) { |
| newPriMax = blk->yMin; |
| gotPriMax = true; |
| } |
| } |
| break; |
| } |
| if (gotPriMin) { |
| if (newPriMin > xMin) { |
| newPriMin = xMin; |
| } |
| if (newPriMin > priMin) { |
| priMin = newPriMin; |
| } |
| } |
| if (gotPriMax) { |
| if (newPriMax < xMax) { |
| newPriMax = xMax; |
| } |
| if (newPriMax < priMax) { |
| priMax = newPriMax; |
| } |
| } |
| } |
| |
| int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) { |
| TextBlock *blk1 = *(TextBlock **)p1; |
| TextBlock *blk2 = *(TextBlock **)p2; |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (blk1->page->primaryRot) { |
| case 0: |
| if ((cmp = blk1->xMin - blk2->xMin) == 0) { |
| cmp = blk1->yMin - blk2->yMin; |
| } |
| break; |
| case 1: |
| if ((cmp = blk1->yMin - blk2->yMin) == 0) { |
| cmp = blk2->xMax - blk1->xMax; |
| } |
| break; |
| case 2: |
| if ((cmp = blk2->xMax - blk1->xMax) == 0) { |
| cmp = blk2->yMin - blk1->yMin; |
| } |
| break; |
| case 3: |
| if ((cmp = blk2->yMax - blk1->yMax) == 0) { |
| cmp = blk1->xMax - blk2->xMax; |
| } |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) { |
| TextBlock *blk1 = *(TextBlock **)p1; |
| TextBlock *blk2 = *(TextBlock **)p2; |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (blk1->page->primaryRot) { |
| case 0: |
| if ((cmp = blk1->yMin - blk2->yMin) == 0) { |
| cmp = blk1->xMin - blk2->xMin; |
| } |
| break; |
| case 1: |
| if ((cmp = blk2->xMax - blk1->xMax) == 0) { |
| cmp = blk1->yMin - blk2->yMin; |
| } |
| break; |
| case 2: |
| if ((cmp = blk2->yMin - blk1->yMin) == 0) { |
| cmp = blk2->xMax - blk1->xMax; |
| } |
| break; |
| case 3: |
| if ((cmp = blk1->xMax - blk2->xMax) == 0) { |
| cmp = blk2->yMax - blk1->yMax; |
| } |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| int TextBlock::primaryCmp(TextBlock *blk) { |
| double cmp; |
| |
| cmp = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| cmp = xMin - blk->xMin; |
| break; |
| case 1: |
| cmp = yMin - blk->yMin; |
| break; |
| case 2: |
| cmp = blk->xMax - xMax; |
| break; |
| case 3: |
| cmp = blk->yMax - yMax; |
| break; |
| } |
| return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
| } |
| |
| double TextBlock::secondaryDelta(TextBlock *blk) { |
| double delta; |
| |
| delta = 0; // make gcc happy |
| switch (rot) { |
| case 0: |
| delta = blk->yMin - yMax; |
| break; |
| case 1: |
| delta = xMin - blk->xMax; |
| break; |
| case 2: |
| delta = yMin - blk->yMax; |
| break; |
| case 3: |
| delta = blk->xMin - xMax; |
| break; |
| } |
| return delta; |
| } |
| |
| bool TextBlock::isBelow(TextBlock *blk) { |
| bool below; |
| |
| below = false; // make gcc happy |
| switch (page->primaryRot) { |
| case 0: |
| below = xMin >= blk->priMin && xMax <= blk->priMax && |
| yMin > blk->yMin; |
| break; |
| case 1: |
| below = yMin >= blk->priMin && yMax <= blk->priMax && |
| xMax < blk->xMax; |
| break; |
| case 2: |
| below = xMin >= blk->priMin && xMax <= blk->priMax && |
| yMax < blk->yMax; |
| break; |
| case 3: |
| below = yMin >= blk->priMin && yMax <= blk->priMax && |
| xMin > blk->xMin; |
| break; |
| } |
| |
| return below; |
| } |
| |
| bool TextBlock::isBeforeByRule1(TextBlock *blk1) { |
| bool before = false; |
| bool overlap = false; |
| |
| switch (this->page->primaryRot) { |
| case 0: |
| case 2: |
| overlap = ((this->ExMin <= blk1->ExMin) && |
| (blk1->ExMin <= this->ExMax)) || |
| ((blk1->ExMin <= this->ExMin) && |
| (this->ExMin <= blk1->ExMax)); |
| break; |
| case 1: |
| case 3: |
| overlap = ((this->EyMin <= blk1->EyMin) && |
| (blk1->EyMin <= this->EyMax)) || |
| ((blk1->EyMin <= this->EyMin) && |
| (this->EyMin <= blk1->EyMax)); |
| break; |
| } |
| switch (this->page->primaryRot) { |
| case 0: |
| before = overlap && this->EyMin < blk1->EyMin; |
| break; |
| case 1: |
| before = overlap && this->ExMax > blk1->ExMax; |
| break; |
| case 2: |
| before = overlap && this->EyMax > blk1->EyMax; |
| break; |
| case 3: |
| before = overlap && this->ExMin < blk1->ExMin; |
| break; |
| } |
| return before; |
| } |
| |
| bool TextBlock::isBeforeByRule2(TextBlock *blk1) { |
| double cmp = 0; |
| int rotLR = rot; |
| |
| if (!page->primaryLR) { |
| rotLR = (rotLR + 2) % 4; |
| } |
| |
| switch (rotLR) { |
| case 0: |
| cmp = ExMax - blk1->ExMin; |
| break; |
| case 1: |
| cmp = EyMin - blk1->EyMax; |
| break; |
| case 2: |
| cmp = blk1->ExMax - ExMin; |
| break; |
| case 3: |
| cmp = blk1->EyMin - EyMax; |
| break; |
| } |
| return cmp <= 0; |
| } |
| |
| // Sort into reading order by performing a topological sort using the rules |
| // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003. |
| // See http://pubs.iupr.org/#2003-breuel-sdiut |
| // Topological sort is done by depth first search, see |
| // http://en.wikipedia.org/wiki/Topological_sorting |
| int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, |
| TextBlock **sorted, int sortPos, |
| bool* visited, |
| TextBlock **cache, int cacheSize) { |
| int pos2; |
| TextBlock *blk1, *blk2, *blk3; |
| bool before; |
| |
| if (visited[pos1]) { |
| return sortPos; |
| } |
| |
| blk1 = this; |
| |
| #if 0 // for debugging |
| printf("visited: %d %.2f..%.2f %.2f..%.2f\n", |
| sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); |
| #endif |
| visited[pos1] = true; |
| pos2 = -1; |
| for (blk2 = blkList; blk2; blk2 = blk2->next) { |
| pos2++; |
| if (visited[pos2]) { |
| // skip visited nodes |
| continue; |
| } |
| before = false; |
| |
| // is blk2 before blk1? (for table entries) |
| if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) { |
| if (page->primaryLR) { |
| if (blk2->xMax <= blk1->xMin && |
| blk2->yMin <= blk1->yMax && |
| blk2->yMax >= blk1->yMin) |
| before = true; |
| } else { |
| if (blk2->xMin >= blk1->xMax && |
| blk2->yMin <= blk1->yMax && |
| blk2->yMax >= blk1->yMin) |
| before = true; |
| } |
| |
| if (blk2->yMax <= blk1->yMin) |
| before = true; |
| } else { |
| if (blk2->isBeforeByRule1(blk1)) { |
| // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1. |
| before = true; |
| #if 0 // for debugging |
| printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n", |
| blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax, |
| blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); |
| #endif |
| } else if (blk2->isBeforeByRule2(blk1)) { |
| // Rule (2) blk2 left of blk1, and no intervening blk3 |
| // such that blk1 is before blk3 by rule 1, |
| // and blk3 is before blk2 by rule 1. |
| before = true; |
| for (int i = 0; i < cacheSize && cache[i]; ++i) { |
| if (blk1->isBeforeByRule1(cache[i]) && |
| cache[i]->isBeforeByRule1(blk2)) { |
| before = false; |
| std::rotate(cache, cache + i, cache + i + 1); |
| break; |
| } |
| } |
| |
| if (before) { |
| for (blk3 = blkList; blk3; blk3 = blk3->next) { |
| if (blk3 == blk2 || blk3 == blk1) { |
| continue; |
| } |
| if (blk1->isBeforeByRule1(blk3) && |
| blk3->isBeforeByRule1(blk2)) { |
| before = false; |
| std::copy_backward(cache, cache + cacheSize - 1, |
| cache + cacheSize); |
| cache[0] = blk3; |
| break; |
| } |
| } |
| } |
| #if 0 // for debugging |
| if (before) { |
| printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n", |
| blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax, |
| blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax); |
| } |
| #endif |
| } |
| } |
| if (before) { |
| // blk2 is before blk1, so it needs to be visited |
| // before we can add blk1 to the sorted list. |
| sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited, cache, cacheSize); |
| } |
| } |
| #if 0 // for debugging |
| printf("sorted: %d %.2f..%.2f %.2f..%.2f\n", |
| sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); |
| #endif |
| sorted[sortPos++] = blk1; |
| return sortPos; |
| } |
| |
| int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, |
| TextBlock **sorted, int sortPos, |
| bool* visited) { |
| const int blockCacheSize = 4; |
| TextBlock *blockCache[blockCacheSize]; |
| std::fill(blockCache, blockCache + blockCacheSize, nullptr); |
| return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, blockCache, |
| blockCacheSize); |
| } |
| |
| //------------------------------------------------------------------------ |
| // TextFlow |
| //------------------------------------------------------------------------ |
| |
| TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) { |
| page = pageA; |
| xMin = blk->xMin; |
| xMax = blk->xMax; |
| yMin = blk->yMin; |
| yMax = blk->yMax; |
| priMin = blk->priMin; |
| priMax = blk->priMax; |
| blocks = lastBlk = blk; |
| next = nullptr; |
| } |
| |
| TextFlow::~TextFlow() { |
| TextBlock *blk; |
| |
| while (blocks) { |
| blk = blocks; |
| blocks = blocks->next; |
| delete blk; |
| } |
| } |
| |
| void TextFlow::addBlock(TextBlock *blk) { |
| if (lastBlk) { |
| lastBlk->next = blk; |
| } else { |
| blocks = blk; |
| } |
| lastBlk = blk; |
| if (blk->xMin < xMin) { |
| xMin = blk->xMin; |
| } |
| if (blk->xMax > xMax) { |
| xMax = blk->xMax; |
| } |
| if (blk->yMin < yMin) { |
| yMin = blk->yMin; |
| } |
| if (blk->yMax > yMax) { |
| yMax = blk->yMax; |
| } |
| } |
| |
| bool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) { |
| bool fits; |
| |
| // lower blocks must use smaller fonts |
| if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) { |
| return false; |
| } |
| |
| fits = false; // make gcc happy |
| switch (page->primaryRot) { |
| case 0: |
| fits = blk->xMin >= priMin && blk->xMax <= priMax; |
| break; |
| case 1: |
| fits = blk->yMin >= priMin && blk->yMax <= priMax; |
| break; |
| case 2: |
| fits = blk->xMin >= priMin && blk->xMax <= priMax; |
| break; |
| case 3: |
| fits = blk->yMin >= priMin && blk->yMax <= priMax; |
| break; |
| } |
| return fits; |
| } |
| |
| #ifdef TEXTOUT_WORD_LIST |
| |
| //------------------------------------------------------------------------ |
| // TextWordList |
| //------------------------------------------------------------------------ |
| |
| TextWordList::TextWordList(TextPage *text, bool physLayout) { |
| TextFlow *flow; |
| TextBlock *blk; |
| TextLine *line; |
| TextWord *word; |
| TextWord **wordArray; |
| int nWords, i; |
| |
| words = new std::vector<TextWord*>(); |
| |
| if (text->rawOrder) { |
| for (word = text->rawWords; word; word = word->next) { |
| words->push_back(word); |
| } |
| |
| } else if (physLayout) { |
| // this is inefficient, but it's also the least useful of these |
| // three cases |
| nWords = 0; |
| for (flow = text->flows; flow; flow = flow->next) { |
| for (blk = flow->blocks; blk; blk = blk->next) { |
| for (line = blk->lines; line; line = line->next) { |
| for (word = line->words; word; word = word->next) { |
| ++nWords; |
| } |
| } |
| } |
| } |
| wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *)); |
| i = 0; |
| for (flow = text->flows; flow; flow = flow->next) { |
| for (blk = flow->blocks; blk; blk = blk->next) { |
| for (line = blk->lines; line; line = line->next) { |
| for (word = line->words; word; word = word->next) { |
| wordArray[i++] = word; |
| } |
| } |
| } |
| } |
| qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX); |
| for (i = 0; i < nWords; ++i) { |
| words->push_back(wordArray[i]); |
| } |
| gfree(wordArray); |
| |
| } else { |
| for (flow = text->flows; flow; flow = flow->next) { |
| for (blk = flow->blocks; blk; blk = blk->next) { |
| for (line = blk->lines; line; line = line->next) { |
| for (word = line->words; word; word = word->next) { |
| words->push_back(word); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| TextWordList::~TextWordList() { |
| delete words; |
| } |
| |
| int TextWordList::getLength() { |
| return words->size(); |
| } |
| |
| TextWord *TextWordList::get(int idx) { |
| if (idx < 0 || idx >= (int)words->size()) { |
| return nullptr; |
| } |
| return (*words)[idx]; |
| } |
| |
| #endif // TEXTOUT_WORD_LIST |
| |
| //------------------------------------------------------------------------ |
| // TextPage |
| //------------------------------------------------------------------------ |
| |
| TextPage::TextPage(bool rawOrderA) { |
| int rot; |
| |
| refCnt = 1; |
| rawOrder = rawOrderA; |
| curWord = nullptr; |
| charPos = 0; |
| curFont = nullptr; |
| curFontSize = 0; |
| nest = 0; |
| nTinyChars = 0; |
| lastCharOverlap = false; |
| if (!rawOrder) { |
| for (rot = 0; rot < 4; ++rot) { |
| pools[rot] = new TextPool(); |
| } |
| } |
| flows = nullptr; |
| blocks = nullptr; |
| rawWords = nullptr; |
| rawLastWord = nullptr; |
| fonts = new std::vector<TextFontInfo*>(); |
| lastFindXMin = lastFindYMin = 0; |
| haveLastFind = false; |
| underlines = new std::vector<TextUnderline*>(); |
| links = new std::vector<TextLink*>(); |
| mergeCombining = true; |
| } |
| |
| TextPage::~TextPage() { |
| int rot; |
| |
| clear(); |
| if (!rawOrder) { |
| for (rot = 0; rot < 4; ++rot) { |
| delete pools[rot]; |
| } |
| } |
| delete fonts; |
| for (auto entry : *underlines) { |
| delete entry; |
| } |
| delete underlines; |
| for (auto entry : *links) { |
| delete entry; |
| } |
| delete links; |
| } |
| |
| void TextPage::incRefCnt() { |
| refCnt++; |
| } |
| |
| void TextPage::decRefCnt() { |
| if (--refCnt == 0) |
| delete this; |
| } |
| |
| void TextPage::startPage(GfxState *state) { |
| clear(); |
| if (state) { |
| pageWidth = state->getPageWidth(); |
| pageHeight = state->getPageHeight(); |
| } else { |
| pageWidth = pageHeight = 0; |
| } |
| } |
| |
| void TextPage::endPage() { |
| if (curWord) { |
| endWord(); |
| } |
| } |
| |
| void TextPage::clear() { |
| int rot; |
| TextFlow *flow; |
| TextWord *word; |
| |
| if (curWord) { |
| delete curWord; |
| curWord = nullptr; |
| } |
| if (rawOrder) { |
| while (rawWords) { |
| word = rawWords; |
| rawWords = rawWords->next; |
| delete word; |
| } |
| } else { |
| for (rot = 0; rot < 4; ++rot) { |
| delete pools[rot]; |
| } |
| while (flows) { |
| flow = flows; |
| flows = flows->next; |
| delete flow; |
| } |
| gfree(blocks); |
| } |
| for (auto entry : *fonts) { |
| delete entry; |
| } |
| delete fonts; |
| for (auto entry : *underlines) { |
| delete entry; |
| } |
| delete underlines; |
| for (auto entry : *links) { |
| delete entry; |
| } |
| delete links; |
| |
| curWord = nullptr; |
| charPos = 0; |
| curFont = nullptr; |
| curFontSize = 0; |
| nest = 0; |
| nTinyChars = 0; |
| if (!rawOrder) { |
| for (rot = 0; rot < 4; ++rot) { |
| pools[rot] = new TextPool(); |
| } |
| } |
| flows = nullptr; |
| blocks = nullptr; |
| rawWords = nullptr; |
| rawLastWord = nullptr; |
| fonts = new std::vector<TextFontInfo*>(); |
| underlines = new std::vector<TextUnderline*>(); |
| links = new std::vector<TextLink*>(); |
| } |
| |
| void TextPage::updateFont(GfxState *state) { |
| GfxFont *gfxFont; |
| const double *fm; |
| const char *name; |
| int code, mCode, letterCode, anyCode; |
| double w; |
| |
| // get the font info object |
| curFont = nullptr; |
| for (std::size_t i = 0; i < fonts->size(); ++i) { |
| curFont = (*fonts)[i]; |
| if (curFont->matches(state)) { |
| break; |
| } |
| curFont = nullptr; |
| } |
| if (!curFont) { |
| curFont = new TextFontInfo(state); |
| fonts->push_back(curFont); |
| } |
| |
| // adjust the font size |
| gfxFont = state->getFont(); |
| curFontSize = state->getTransformedFontSize(); |
| if (gfxFont && gfxFont->getType() == fontType3) { |
| // This is a hack which makes it possible to deal with some Type 3 |
| // fonts. The problem is that it's impossible to know what the |
| // base coordinate system used in the font is without actually |
| // rendering the font. This code tries to guess by looking at the |
| // width of the character 'm' (which breaks if the font is a |
| // subset that doesn't contain 'm'). |
| mCode = letterCode = anyCode = -1; |
| for (code = 0; code < 256; ++code) { |
| name = ((Gfx8BitFont *)gfxFont)->getCharName(code); |
| int nameLen = name ? strlen(name) : 0; |
| bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0'); |
| if (nameOneChar && name[0] == 'm') { |
| mCode = code; |
| } |
| if (letterCode < 0 && nameOneChar && |
| ((name[0] >= 'A' && name[0] <= 'Z') || |
| (name[0] >= 'a' && name[0] <= 'z'))) { |
| letterCode = code; |
| } |
| if (anyCode < 0 && name && |
| ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { |
| anyCode = code; |
| } |
| } |
| if (mCode >= 0 && |
| (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { |
| // 0.6 is a generic average 'm' width -- yes, this is a hack |
| curFontSize *= w / 0.6; |
| } else if (letterCode >= 0 && |
| (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { |
| // even more of a hack: 0.5 is a generic letter width |
| curFontSize *= w / 0.5; |
| } else if (anyCode >= 0 && |
| (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { |
| // better than nothing: 0.5 is a generic character width |
| curFontSize *= w / 0.5; |
| } |
| fm = gfxFont->getFontMatrix(); |
| if (fm[0] != 0) { |
| curFontSize *= fabs(fm[3] / fm[0]); |
| } |
| } |
| } |
| |
| void TextPage::beginWord(GfxState *state) { |
| GfxFont *gfxFont; |
| const double *fontm; |
| double m[4], m2[4]; |
| int rot; |
| |
| // This check is needed because Type 3 characters can contain |
| // text-drawing operations (when TextPage is being used via |
| // {X,Win}SplashOutputDev rather than TextOutputDev). |
| if (curWord) { |
| ++nest; |
| return; |
| } |
| |
| // compute the rotation |
| state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]); |
| gfxFont = state->getFont(); |
| if (gfxFont && gfxFont->getType() == fontType3) { |
| fontm = state->getFont()->getFontMatrix(); |
| m2[0] = fontm[0] * m[0] + fontm[1] * m[2]; |
| m2[1] = fontm[0] * m[1] + fontm[1] * m[3]; |
| m2[2] = fontm[2] * m[0] + fontm[3] * m[2]; |
| m2[3] = fontm[2] * m[1] + fontm[3] * m[3]; |
| m[0] = m2[0]; |
| m[1] = m2[1]; |
| m[2] = m2[2]; |
| m[3] = m2[3]; |
| } |
| if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) { |
| rot = (m[0] > 0 || m[3] < 0) ? 0 : 2; |
| } else { |
| rot = (m[2] > 0) ? 1 : 3; |
| } |
| |
| // for vertical writing mode, the lines are effectively rotated 90 |
| // degrees |
| if (gfxFont && gfxFont->getWMode()) { |
| rot = (rot + 1) & 3; |
| } |
| |
| curWord = new TextWord(state, rot, curFontSize); |
| } |
| |
| void TextPage::addChar(GfxState *state, double x, double y, |
| double dx, double dy, |
| CharCode c, int nBytes, Unicode *u, int uLen) { |
| double x1, y1, w1, h1, dx2, dy2, base, sp, delta; |
| bool overlap; |
| int i; |
| int wMode; |
| Matrix mat; |
| |
| // subtract char and word spacing from the dx,dy values |
| sp = state->getCharSpace(); |
| if (c == (CharCode)0x20) { |
| sp += state->getWordSpace(); |
| } |
| state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); |
| dx -= dx2; |
| dy -= dy2; |
| state->transformDelta(dx, dy, &w1, &h1); |
| |
| // throw away chars that aren't inside the page bounds |
| // (and also do a sanity check on the character size) |
| state->transform(x, y, &x1, &y1); |
| if (x1 + w1 < 0 || x1 > pageWidth || |
| y1 + h1 < 0 || y1 > pageHeight || |
| x1 != x1 || y1 != y1 || // IEEE way of checking for isnan |
| w1 != w1 || h1 != h1) { |
| charPos += nBytes; |
| return; |
| } |
| |
| // check the tiny chars limit |
| if (fabs(w1) < 3 && fabs(h1) < 3) { |
| if (++nTinyChars > 50000) { |
| charPos += nBytes; |
| return; |
| } |
| } |
| |
| // break words at space character |
| if (uLen == 1 && UnicodeIsWhitespace(u[0])) { |
| charPos += nBytes; |
| endWord(); |
| return; |
| } else if (uLen == 1 && u[0] == (Unicode)0x0) { |
| // ignore null characters |
| charPos += nBytes; |
| return; |
| } |
| |
| state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]); |
| mat.m[0] *= state->getHorizScaling(); |
| mat.m[1] *= state->getHorizScaling(); |
| mat.m[4] = x1; |
| mat.m[5] = y1; |
| |
| if (mergeCombining && curWord && uLen == 1 && |
| curWord->addCombining(state, curFont, curFontSize, x1, y1, w1, h1, charPos, nBytes, c, |
| u[0], mat)) { |
| charPos += nBytes; |
| return; |
| } |
| |
| // start a new word if: |
| // (1) this character doesn't fall in the right place relative to |
| // the end of the previous word (this places upper and lower |
| // constraints on the position deltas along both the primary |
| // and secondary axes), or |
| // (2) this character overlaps the previous one (duplicated text), or |
| // (3) the previous character was an overlap (we want each duplicated |
| // character to be in a word by itself at this stage), |
| // (4) the font size has changed |
| // (5) the WMode changed |
| if (curWord && curWord->len > 0) { |
| base = sp = delta = 0; // make gcc happy |
| switch (curWord->rot) { |
| case 0: |
| base = y1; |
| sp = x1 - curWord->xMax; |
| delta = x1 - curWord->edge[curWord->len - 1]; |
| break; |
| case 1: |
| base = x1; |
| sp = y1 - curWord->yMax; |
| delta = y1 - curWord->edge[curWord->len - 1]; |
| break; |
| case 2: |
| base = y1; |
| sp = curWord->xMin - x1; |
| delta = curWord->edge[curWord->len - 1] - x1; |
| break; |
| case 3: |
| base = x1; |
| sp = curWord->yMin - y1; |
| delta = curWord->edge[curWord->len - 1] - y1; |
| break; |
| } |
| overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && |
| fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize; |
| wMode = curFont->getWMode(); |
| if (overlap || lastCharOverlap || |
| sp < -minDupBreakOverlap * curWord->fontSize || |
| sp > minWordBreakSpace * curWord->fontSize || |
| fabs(base - curWord->base) > 0.5 || |
| curFontSize != curWord->fontSize || |
| wMode != curWord->wMode) { |
| endWord(); |
| } |
| lastCharOverlap = overlap; |
| } else { |
| lastCharOverlap = false; |
| } |
| |
| if (uLen != 0) { |
| // start a new word if needed |
| if (!curWord) { |
| beginWord(state); |
| } |
| |
| // page rotation and/or transform matrices can cause text to be |
| // drawn in reverse order -- in this case, swap the begin/end |
| // coordinates and break text into individual chars |
| if ((curWord->rot == 0 && w1 < 0) || |
| (curWord->rot == 1 && h1 < 0) || |
| (curWord->rot == 2 && w1 > 0) || |
| (curWord->rot == 3 && h1 > 0)) { |
| endWord(); |
| beginWord(state); |
| x1 += w1; |
| y1 += h1; |
| w1 = -w1; |
| h1 = -h1; |
| } |
| |
| // add the characters to the current word |
| w1 /= uLen; |
| h1 /= uLen; |
| for (i = 0; i < uLen; ++i) { |
| curWord->addChar(state, curFont, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i], mat); |
| } |
| |