| //======================================================================== |
| // |
| // StructElement.h |
| // |
| // This file is licensed under the GPLv2 or later |
| // |
| // Copyright 2013, 2014 Igalia S.L. |
| // Copyright 2014 Luigi Scarso <luigi.scarso@gmail.com> |
| // Copyright 2014, 2018, 2019 Albert Astals Cid <aacid@kde.org> |
| // Copyright 2018 Adam Reichold <adam.reichold@t-online.de> |
| // |
| //======================================================================== |
| |
| #ifndef STRUCTELEMENT_H |
| #define STRUCTELEMENT_H |
| |
| #include "goo/GooString.h" |
| #include "MarkedContentOutputDev.h" |
| #include "Object.h" |
| #include <vector> |
| #include <set> |
| |
| class GooString; |
| class Dict; |
| class StructElement; |
| class StructTreeRoot; |
| |
| |
| class Attribute { |
| public: |
| enum Type { |
| Unknown = 0, // Uninitialized, parsing error, etc. |
| UserProperty, // User defined attribute (i.e. non-standard) |
| |
| // Common standard attributes |
| Placement, WritingMode, BackgroundColor, BorderColor, BorderStyle, |
| BorderThickness, Color, Padding, |
| |
| // Block element standard attributes |
| SpaceBefore, SpaceAfter, StartIndent, EndIndent, TextIndent, TextAlign, |
| BBox, Width, Height, BlockAlign, InlineAlign, TBorderStyle, TPadding, |
| |
| // Inline element standard attributes |
| BaselineShift, LineHeight, TextDecorationColor, TextDecorationThickness, |
| TextDecorationType, RubyAlign, RubyPosition, GlyphOrientationVertical, |
| |
| // Column-only standard attributes |
| ColumnCount, ColumnGap, ColumnWidths, |
| |
| // List-only standard attributes |
| ListNumbering, |
| |
| // PrintField-only standard attributes |
| Role, checked, Desc, |
| |
| // Table-only standard attributes |
| RowSpan, ColSpan, Headers, Scope, Summary, |
| }; |
| |
| enum Owner { |
| UnknownOwner = 0, |
| // User-defined attributes |
| UserProperties, |
| // Standard attributes |
| Layout, List, PrintField, Table, |
| // Translation to other formats |
| XML_1_00, HTML_3_20, HTML_4_01, OEB_1_00, RTF_1_05, CSS_1_00, CSS_2_00, |
| }; |
| |
| // Creates a standard attribute. The name is predefined, and the |
| // value is type-checked to conform to the PDF specification. |
| Attribute(Type type, Object *value); |
| |
| // Creates an UserProperty attribute, with an arbitrary name and value. |
| Attribute(GooString &&name, Object *value); |
| |
| bool isOk() const { return type != Unknown; } |
| |
| // Name, type and value can be set only on construction. |
| Type getType() const { return type; } |
| Owner getOwner() const { return owner; } |
| const char *getTypeName() const; |
| const char *getOwnerName() const; |
| Object *getValue() const { return &value; } |
| static Object *getDefaultValue(Type type); |
| |
| // The caller gets the ownership of the return GooString and is responsible of deleting it |
| GooString *getName() const { return type == UserProperty ? name.copy() : new GooString(getTypeName()); } |
| |
| // The revision is optional, and defaults to zero. |
| unsigned int getRevision() const { return revision; } |
| void setRevision(unsigned int revisionA) { revision = revisionA; } |
| |
| // Hidden elements should not be displayed by the user agent |
| bool isHidden() const { return hidden; } |
| void setHidden(bool hiddenA) { hidden = hiddenA; } |
| |
| // The formatted value may be in the PDF, or be left undefined (nullptr). |
| // In the later case the user agent should provide a default representation. |
| const char *getFormattedValue() const { return formatted ? formatted->c_str() : nullptr; } |
| void setFormattedValue(const char *formattedA); |
| |
| ~Attribute(); |
| |
| private: |
| Type type; |
| Owner owner; |
| unsigned int revision; |
| mutable GooString name; |
| mutable Object value; |
| bool hidden; |
| GooString *formatted; |
| |
| bool checkType(StructElement *element = nullptr); |
| static Type getTypeForName(const char *name, StructElement *element = nullptr); |
| static Attribute *parseUserProperty(Dict *property); |
| |
| friend class StructElement; |
| }; |
| |
| |
| class StructElement { |
| public: |
| enum Type { |
| Unknown = 0, |
| MCID, // MCID reference, used internally |
| OBJR, // Object reference, used internally |
| |
| Document, Part, Art, Sect, Div, // Structural elements |
| |
| Span, Quote, Note, Reference, BibEntry, // Inline elements |
| Code, Link, Annot, |
| BlockQuote, Caption, NonStruct, |
| TOC, TOCI, Index, Private, |
| |
| P, H, H1, H2, H3, H4, H5, H6, // Paragraph-like |
| |
| L, LI, Lbl, LBody, // List elements |
| |
| Table, TR, TH, TD, THead, TFoot, TBody, // Table elements |
| |
| Ruby, RB, RT, RP, // Ruby text elements |
| Warichu, WT, WP, |
| |
| Figure, Formula, Form, // Illustration-like elements |
| }; |
| |
| static const Ref InvalidRef; |
| |
| const char *getTypeName() const; |
| Type getType() const { return type; } |
| bool isOk() const { return type != Unknown; } |
| bool isBlock() const; |
| bool isInline() const; |
| bool isGrouping() const; |
| |
| inline bool isContent() const { return (type == MCID) || isObjectRef(); } |
| inline bool isObjectRef() const { return (type == OBJR && c->ref != Ref::INVALID()); } |
| |
| int getMCID() const { return c->mcid; } |
| Ref getObjectRef() const { return c->ref; } |
| Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef; } |
| bool hasPageRef() const; |
| bool getPageRef(Ref& ref) const; |
| StructTreeRoot *getStructTreeRoot() { return treeRoot; } |
| |
| // Optional element identifier. |
| const GooString *getID() const { return isContent() ? nullptr : s->id; } |
| GooString *getID() { return isContent() ? nullptr : s->id; } |
| |
| // Optional ISO language name, e.g. en_US |
| GooString *getLanguage() { |
| if (!isContent() && s->language) return s->language; |
| return parent ? parent->getLanguage() : nullptr; |
| } |
| const GooString *getLanguage() const { |
| if (!isContent() && s->language) return s->language; |
| return parent ? parent->getLanguage() : nullptr; |
| } |
| |
| // Optional revision number, defaults to zero. |
| unsigned int getRevision() const { return isContent() ? 0 : s->revision; } |
| void setRevision(unsigned int revision) { if (isContent()) s->revision = revision; } |
| |
| // Optional element title, in human-readable form. |
| const GooString *getTitle() const { return isContent() ? nullptr : s->title; } |
| GooString *getTitle() { return isContent() ? nullptr : s->title; } |
| |
| // Optional element expanded abbreviation text. |
| const GooString *getExpandedAbbr() const { return isContent() ? nullptr : s->expandedAbbr; } |
| GooString *getExpandedAbbr() { return isContent() ? nullptr : s->expandedAbbr; } |
| |
| unsigned getNumChildren() const { return isContent() ? 0 : s->elements.size(); } |
| const StructElement *getChild(int i) const { return isContent() ? nullptr : s->elements.at(i); } |
| StructElement *getChild(int i) { return isContent() ? nullptr : s->elements.at(i); } |
| |
| void appendChild(StructElement *element) { |
| if (!isContent() && element && element->isOk()) { |
| s->elements.push_back(element); |
| } |
| } |
| |
| unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); } |
| const Attribute *getAttribute(int i) const { return isContent() ? nullptr : s->attributes.at(i); } |
| Attribute *getAttribute(int i) { return isContent() ? nullptr : s->attributes.at(i); } |
| |
| void appendAttribute(Attribute *attribute) { |
| if (!isContent() && attribute) { |
| s->attributes.push_back(attribute); |
| } |
| } |
| |
| const Attribute* findAttribute(Attribute::Type attributeType, bool inherit = false, |
| Attribute::Owner owner = Attribute::UnknownOwner) const; |
| |
| const GooString *getAltText() const { return isContent() ? nullptr : s->altText; } |
| GooString *getAltText() { return isContent() ? nullptr : s->altText; } |
| |
| const GooString *getActualText() const { return isContent() ? nullptr : s->actualText; } |
| GooString *getActualText() { return isContent() ? nullptr : s->actualText; } |
| |
| // Content text referenced by the element: |
| // |
| // - For MCID reference elements, this is just the text of the |
| // corresponding marked content object in the page stream, regardless |
| // of the setting of the "recursive" flag. |
| // - For other elements, if the "recursive" flag is set, the text |
| // enclosed by *all* the child MCID reference elements of the subtree |
| // is returned. The text is assembled by traversing the leaf MCID |
| // reference elements in logical order. |
| // - In any other case, the function returns nullptr. |
| // |
| // A new string is returned, and the ownership passed to the caller. |
| // |
| GooString *getText(bool recursive = true) const { |
| return appendSubTreeText(nullptr, recursive); |
| } |
| |
| const TextSpanArray getTextSpans() const { |
| if (!isContent()) |
| return TextSpanArray(); |
| MarkedContentOutputDev mcdev(getMCID()); |
| return getTextSpansInternal(mcdev); |
| } |
| |
| ~StructElement(); |
| |
| private: |
| GooString* appendSubTreeText(GooString *string, bool recursive) const; |
| const TextSpanArray& getTextSpansInternal(MarkedContentOutputDev& mcdev) const; |
| |
| typedef std::vector<Attribute*> AttrPtrArray; |
| typedef std::vector<StructElement*> ElemPtrArray; |
| |
| struct StructData { |
| Ref parentRef; |
| GooString *altText; |
| GooString *actualText; |
| GooString *id; |
| GooString *title; |
| GooString *expandedAbbr; |
| GooString *language; |
| unsigned int revision; |
| ElemPtrArray elements; |
| AttrPtrArray attributes; |
| |
| StructData(); |
| ~StructData(); |
| |
| StructData(const StructData &) = delete; |
| StructData& operator=(const StructData &) = delete; |
| }; |
| |
| // Data in content elements (MCID, MCR) |
| struct ContentData { |
| union { |
| int mcid; |
| Ref ref; |
| }; |
| |
| ContentData(int mcidA): mcid(mcidA) {} |
| ContentData(const Ref r) { ref = r; } |
| }; |
| |
| // Common data |
| Type type; |
| StructTreeRoot *treeRoot; |
| StructElement *parent; |
| mutable Object pageRef; |
| |
| union { |
| StructData *s; |
| ContentData *c; |
| }; |
| |
| StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, std::set<int> &seen); |
| StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA); |
| StructElement(const Ref ref, StructTreeRoot *treeRootA, StructElement *parentA); |
| |
| void parse(Dict* elementDict); |
| StructElement* parseChild(const Object *ref, Object* childObj, std::set<int> &seen); |
| void parseChildren(Dict* element, std::set<int> &seen); |
| void parseAttributes(Dict *element, bool keepExisting = false); |
| |
| friend class StructTreeRoot; |
| }; |
| |
| #endif |
| |