blob: 8f1849e730315e95e6e624222bfc9bedec96b252 [file] [log] [blame]
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "src/pdf/SkPDFTag.h"
#include "include/core/SkPoint.h"
#include "include/core/SkScalar.h"
#include "include/private/base/SkAssert.h"
#include "include/private/base/SkTo.h"
#include "src/pdf/SkPDFDocumentPriv.h"
#include <algorithm>
#include <memory>
#include <utility>
using namespace skia_private;
// The struct parent tree consists of one entry per page, followed by
// entries for individual struct tree nodes corresponding to
// annotations. Each entry is a key/value pair with an integer key
// and an indirect reference value.
//
// The page entries get consecutive keys starting at 0. Since we don't
// know the total number of pages in the document at the time we start
// processing annotations, start the key for annotations with a large
// number, which effectively becomes the maximum number of pages in a
// PDF we can handle.
const int kFirstAnnotationStructParentKey = 100000;
namespace {
// A point on a page. A default-constructed Location has a NaN point, which
// accumulate() treats as "no position recorded yet".
struct Location {
    SkPoint fPoint{SK_ScalarNaN, SK_ScalarNaN};
    unsigned fPageIndex{0};

    // Merges `child` into this location, keeping the position on the lowest
    // page index. When both are on the same page, the result is the minimum x
    // and maximum y of the two points. Children without a finite point are
    // ignored.
    void accumulate(Location const& child) {
        if (!child.fPoint.isFinite()) {
            return;
        }
        const bool adoptChild = !fPoint.isFinite() || child.fPageIndex < fPageIndex;
        if (adoptChild) {
            *this = child;
        } else if (child.fPageIndex == fPageIndex) {
            fPoint.fX = std::min(child.fPoint.fX, fPoint.fX);
            fPoint.fY = std::max(child.fPoint.fY, fPoint.fY);  // PDF y-up
        }
        // A child on a later page leaves this location unchanged.
    }
};
}  // namespace
// Internal copy of one SkPDF::StructureElementNode. Nodes are filled in by
// SkPDFTagTree::Copy and written out as "StructElem" dictionaries by
// SkPDFTagTree::PrepareTagTreeToEmit.
struct SkPDFTagNode {
// Structure element nodes need a unique alphanumeric ID,
// and we need to be able to output them sorted in lexicographic
// order. This helper function takes one of our node IDs and
// builds an ID string that zero-pads the digits so that lexicographic
// order matches numeric order.
static SkString nodeIdToString(int nodeId) {
SkString idString;
idString.printf("node%08d", nodeId);
return idString;
}
// Array of fChildCount child nodes (allocated from the arena in SkPDFTagTree::Copy).
SkPDFTagNode* fChildren = nullptr;
size_t fChildCount = 0;
// One piece of marked content attached to this node.
struct MarkedContentInfo {
// Page and point supplied to createMarkIdForNodeId.
Location fLocation;
// The mark's ID; createMarkIdForNodeId assigns the count of marks already on the page.
int fMarkId;
};
// Marks recorded for this node by createMarkIdForNodeId.
TArray<MarkedContentInfo> fMarkedContent;
// ID copied from the source StructureElementNode.
int fNodeId;
// While true, addNodeTitle appends text to fTitle.
bool fWantTitle;
// Structure type; emitted as "S" ("NonStruct" is emitted when this is empty).
SkString fTypeString;
// Text accumulated by addNodeTitle.
SkString fTitle;
// Emitted as "Alt" when non-empty.
SkString fAlt;
// Emitted as "Lang" when non-empty.
SkString fLang;
// Reference of the emitted StructElem; assigned in PrepareTagTreeToEmit.
SkPDFIndirectReference fRef;
// Memoized result of can_discard(); createStructParentKeyForNodeId forces kNo.
enum State {
kUnknown,
kYes,
kNo,
} fCanDiscard = kUnknown;
// Attribute array moved from the source node; emitted as "A" when present.
std::unique_ptr<SkPDFArray> fAttributes;
// An annotation associated with this node.
struct AnnotationInfo {
unsigned fPageIndex;
SkPDFIndirectReference fAnnotationRef;
};
// Annotations recorded by addNodeAnnotation; emitted as "OBJR" kids.
std::vector<AnnotationInfo> fAnnotations;
};
// Constructor and destructor of AttributeList are defaulted here, out of line.
SkPDF::AttributeList::AttributeList() = default;
SkPDF::AttributeList::~AttributeList() = default;
// Appends an attribute dictionary of the form {"O": owner, name: value} with
// an integer value. The backing array is created on first use.
void SkPDF::AttributeList::appendInt(
        const char* owner, const char* name, int value) {
    if (!fAttrs) {
        fAttrs = SkPDFMakeArray();
    }
    auto entry = SkPDFMakeDict();
    entry->insertName("O", owner);
    entry->insertInt(name, value);
    fAttrs->appendObject(std::move(entry));
}
// Appends an attribute dictionary of the form {"O": owner, name: value} with
// a scalar value. The backing array is created on first use.
void SkPDF::AttributeList::appendFloat(
        const char* owner, const char* name, float value) {
    if (!fAttrs) {
        fAttrs = SkPDFMakeArray();
    }
    auto entry = SkPDFMakeDict();
    entry->insertName("O", owner);
    entry->insertScalar(name, value);
    fAttrs->appendObject(std::move(entry));
}
// Appends an attribute dictionary of the form {"O": owner, name: value} where
// the value is written as a PDF name. The backing array is created on first
// use.
void SkPDF::AttributeList::appendName(
        const char* owner, const char* name, const char* value) {
    if (!fAttrs) {
        fAttrs = SkPDFMakeArray();
    }
    auto entry = SkPDFMakeDict();
    entry->insertName("O", owner);
    entry->insertName(name, value);
    fAttrs->appendObject(std::move(entry));
}
// Appends an attribute dictionary of the form {"O": owner, name: [v0 v1 ...]}
// whose value is an array of scalars. The backing array is created on first
// use.
void SkPDF::AttributeList::appendFloatArray(
        const char* owner, const char* name, const std::vector<float>& value) {
    if (!fAttrs) {
        fAttrs = SkPDFMakeArray();
    }
    auto entry = SkPDFMakeDict();
    entry->insertName("O", owner);

    auto scalars = SkPDFMakeArray();
    for (const float scalar : value) {
        scalars->appendScalar(scalar);
    }
    entry->insertObject(name, std::move(scalars));

    fAttrs->appendObject(std::move(entry));
}
void SkPDF::AttributeList::appendNodeIdArray(
const char* owner,
const char* name,
const std::vector<int>& nodeIds) {
if (!fAttrs)
fAttrs = SkPDFMakeArray();
std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
attrDict->insertName("O", owner);
std::unique_ptr<SkPDFArray> pdfArray = SkPDFMakeArray();
for (int nodeId : nodeIds) {
SkString idString = SkPDFTagNode::nodeIdToString(nodeId);
pdfArray->appendByteString(idString);
}
attrDict->insertObject(name, std::move(pdfArray));
fAttrs->appendObject(std::move(attrDict));
}
// Constructs an empty tag tree. The arena is constructed with
// 4 * sizeof(SkPDFTagNode) (presumably its initial block size — confirm
// against SkArenaAlloc).
SkPDFTagTree::SkPDFTagTree() : fArena(4 * sizeof(SkPDFTagNode)) {}
SkPDFTagTree::~SkPDFTagTree() = default;
// Recursively copies `node` and its descendants into `dst`, allocating child
// arrays from `arena` and registering the node ID (and every additional node
// ID) in `nodeMap`. `wantTitle` is inherited from the parent and is switched
// on for H1..H6 nodes when fOutline requests an outline built from structure
// element headers. The attribute array is moved out of `node`.
void SkPDFTagTree::Copy(SkPDF::StructureElementNode& node,
                        SkPDFTagNode* dst,
                        SkArenaAlloc* arena,
                        THashMap<int, SkPDFTagNode*>* nodeMap,
                        bool wantTitle) {
    nodeMap->set(node.fNodeId, dst);
    for (int extraId : node.fAdditionalNodeIds) {
        SkASSERT(!nodeMap->find(extraId));
        nodeMap->set(extraId, dst);
    }
    dst->fNodeId = node.fNodeId;

    // Accumulate title text, need to be in sync with create_outline_from_headers
    const char* type = node.fTypeString.c_str();
    const bool isHeader = type[0] == 'H' && '1' <= type[1] && type[1] <= '6';
    if (fOutline == SkPDF::Metadata::Outline::StructureElementHeaders && isHeader) {
        wantTitle = true;
    }
    dst->fWantTitle = wantTitle;

    dst->fTypeString = node.fTypeString;
    dst->fAlt = node.fAlt;
    dst->fLang = node.fLang;

    const size_t kidCount = node.fChildVector.size();
    SkPDFTagNode* const kids = arena->makeArray<SkPDFTagNode>(kidCount);
    dst->fChildCount = kidCount;
    dst->fChildren = kids;
    for (size_t k = 0; k < kidCount; ++k) {
        Copy(*node.fChildVector[k], &kids[k], arena, nodeMap, wantTitle);
    }

    dst->fAttributes = std::move(node.fAttributes.fAttrs);
}
// Builds the internal tag tree from `node` and records the outline mode.
// Does nothing when `node` is null.
void SkPDFTagTree::init(SkPDF::StructureElementNode* node, SkPDF::Metadata::Outline outline) {
    if (!node) {
        return;
    }
    fRoot = fArena.make<SkPDFTagNode>();
    fOutline = outline;
    Copy(*node, fRoot, &fArena, &fNodeMap, /*wantTitle=*/false);
}
// Returns the marked-content ID of this mark, or -1 when the mark has no node.
int SkPDFTagTree::Mark::id() {
    if (!fNode) {
        return -1;
    }
    return fNode->fMarkedContent[fMarkIndex].fMarkId;
}
// Returns a mutable reference to the point stored for this mark. Unlike id(),
// this dereferences fNode without a null check.
SkPoint& SkPDFTagTree::Mark::point() {
    auto& markInfo = fNode->fMarkedContent[fMarkIndex];
    return markInfo.fLocation.fPoint;
}
// Records a new piece of marked content for the node registered under
// `nodeId`, located at `point` on page `pageIndex`. The new mark's ID is the
// number of marks already recorded for that page. Returns an empty Mark when
// there is no tree or the node ID is unknown.
auto SkPDFTagTree::createMarkIdForNodeId(int nodeId, unsigned pageIndex, SkPoint point) -> Mark {
    if (!fRoot) {
        return Mark();
    }
    SkPDFTagNode** found = fNodeMap.find(nodeId);
    if (!found) {
        return Mark();
    }
    SkPDFTagNode* const owner = *found;
    SkASSERT(owner);

    // Grow the per-page table until it has a slot for pageIndex.
    while (SkToUInt(fMarksPerPage.size()) < pageIndex + 1) {
        fMarksPerPage.push_back();
    }

    TArray<SkPDFTagNode*>& marksOnPage = fMarksPerPage[pageIndex];
    const int newMarkId = marksOnPage.size();
    owner->fMarkedContent.push_back({{point, pageIndex}, newMarkId});
    marksOnPage.push_back(owner);
    return Mark(owner, owner->fMarkedContent.size() - 1);
}
int SkPDFTagTree::createStructParentKeyForNodeId(int nodeId, unsigned pageIndex) {
if (!fRoot) {
return -1;
}
SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
if (!tagPtr) {
return -1;
}
SkPDFTagNode* tag = *tagPtr;
SkASSERT(tag);
tag->fCanDiscard = SkPDFTagNode::kNo;
int nextStructParentKey = kFirstAnnotationStructParentKey +
static_cast<int>(fParentTreeAnnotationNodeIds.size());
fParentTreeAnnotationNodeIds.push_back(nodeId);
return nextStructParentKey;
}
// Returns true when `node` has no marked content and every child is itself
// discardable. The answer is memoized in node->fCanDiscard, so a node already
// set to kYes or kNo is answered without looking at its contents.
static bool can_discard(SkPDFTagNode* node) {
    switch (node->fCanDiscard) {
        case SkPDFTagNode::kYes:
            return true;
        case SkPDFTagNode::kNo:
            return false;
        case SkPDFTagNode::kUnknown:
            break;
    }

    bool discardable = node->fMarkedContent.empty();
    // Stop at the first child that must be kept.
    for (size_t i = 0; discardable && i < node->fChildCount; ++i) {
        discardable = can_discard(&node->fChildren[i]);
    }

    node->fCanDiscard = discardable ? SkPDFTagNode::kYes : SkPDFTagNode::kNo;
    return discardable;
}
// Emits `node` as a "StructElem" dictionary whose "P" entry is `parent`, and
// returns the reference of the emitted object. The "K" array lists, in order:
// the non-discardable child elements (emitted recursively), one "MCR"
// dictionary per marked-content item, and one "OBJR" dictionary per
// annotation. Also stores the reference in node->fRef and appends an entry to
// fIdTreeEntries. The node's attribute array is moved into the dictionary.
SkPDFIndirectReference SkPDFTagTree::PrepareTagTreeToEmit(SkPDFIndirectReference parent,
                                                          SkPDFTagNode* node,
                                                          SkPDFDocument* doc) {
    const SkPDFIndirectReference selfRef = doc->reserveRef();
    std::unique_ptr<SkPDFArray> kidsArray = SkPDFMakeArray();

    // Child structure elements.
    for (SkPDFTagNode& child : SkSpan<SkPDFTagNode>(node->fChildren, node->fChildCount)) {
        if (can_discard(&child)) {
            continue;
        }
        kidsArray->appendRef(PrepareTagTreeToEmit(selfRef, &child, doc));
    }

    // Marked-content references.
    for (const SkPDFTagNode::MarkedContentInfo& mark : node->fMarkedContent) {
        auto mcrDict = SkPDFMakeDict("MCR");
        mcrDict->insertRef("Pg", doc->getPage(mark.fLocation.fPageIndex));
        mcrDict->insertInt("MCID", mark.fMarkId);
        kidsArray->appendObject(std::move(mcrDict));
    }

    // Object references for annotations.
    for (const SkPDFTagNode::AnnotationInfo& annotation : node->fAnnotations) {
        auto objrDict = SkPDFMakeDict("OBJR");
        objrDict->insertRef("Obj", annotation.fAnnotationRef);
        objrDict->insertRef("Pg", doc->getPage(annotation.fPageIndex));
        kidsArray->appendObject(std::move(objrDict));
    }

    node->fRef = selfRef;

    SkPDFDict dict("StructElem");
    const char* structType =
            node->fTypeString.isEmpty() ? "NonStruct" : node->fTypeString.c_str();
    dict.insertName("S", structType);
    if (!node->fAlt.isEmpty()) {
        dict.insertTextString("Alt", node->fAlt);
    }
    if (!node->fLang.isEmpty()) {
        dict.insertTextString("Lang", node->fLang);
    }
    dict.insertRef("P", parent);
    dict.insertObject("K", std::move(kidsArray));
    if (node->fAttributes) {
        dict.insertObject("A", std::move(node->fAttributes));
    }

    // Each node has a unique ID that also needs to be referenced
    // in a separate IDTree node, along with the lowest and highest
    // unique ID string.
    SkString encodedId = SkPDFTagNode::nodeIdToString(node->fNodeId);
    dict.insertByteString("ID", encodedId.c_str());
    IDTreeEntry idEntry = {node->fNodeId, selfRef};
    fIdTreeEntries.push_back(idEntry);

    return doc->emit(dict, selfRef);
}
void SkPDFTagTree::addNodeAnnotation(int nodeId, SkPDFIndirectReference annotationRef, unsigned pageIndex) {
if (!fRoot) {
return;
}
SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
if (!tagPtr) {
return;
}
SkPDFTagNode* tag = *tagPtr;
SkASSERT(tag);
SkPDFTagNode::AnnotationInfo annotationInfo = {pageIndex, annotationRef};
tag->fAnnotations.push_back(annotationInfo);
}
void SkPDFTagTree::addNodeTitle(int nodeId, SkSpan<const char> title) {
if (!fRoot) {
return;
}
SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
if (!tagPtr) {
return;
}
SkPDFTagNode* tag = *tagPtr;
SkASSERT(tag);
if (tag->fWantTitle) {
tag->fTitle.append(title.data(), title.size());
// Arbitrary cutoff for size.
if (tag->fTitle.size() > 1023) {
tag->fWantTitle = false;
}
}
}
// Emits the "StructTreeRoot" dictionary together with the structure elements
// (via PrepareTagTreeToEmit), the "ParentTree" and, when any element was
// emitted, the "IDTree". Returns the reference of the StructTreeRoot, or an
// empty reference when there is no tree or the whole tree can be discarded.
SkPDFIndirectReference SkPDFTagTree::makeStructTreeRoot(SkPDFDocument* doc) {
if (!fRoot || can_discard(fRoot)) {
return SkPDFIndirectReference();
}
SkPDFIndirectReference ref = doc->reserveRef();
unsigned pageCount = SkToUInt(doc->pageCount());
// Build the StructTreeRoot.
SkPDFDict structTreeRoot("StructTreeRoot");
// Emitting the elements also assigns fRef on every emitted node and fills
// fIdTreeEntries; both are read further down.
structTreeRoot.insertRef("K", PrepareTagTreeToEmit(ref, fRoot, doc));
// NOTE(review): annotation entries below use keys starting at
// kFirstAnnotationStructParentKey, which is larger than the page count
// written here — confirm whether ParentTreeNextKey should account for them.
structTreeRoot.insertInt("ParentTreeNextKey", SkToInt(pageCount));
// Build the parent tree, which consists of two things:
// (1) For each page, a mapping from the marked content IDs on
// each page to their corresponding tags
// (2) For each annotation, an indirect reference to that
// annotation's struct tree element.
SkPDFDict parentTree("ParentTree");
auto parentTreeNums = SkPDFMakeArray();
// First, one entry per page.
SkASSERT(SkToUInt(fMarksPerPage.size()) <= pageCount);
for (int j = 0; j < fMarksPerPage.size(); ++j) {
const TArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[j];
// Position in this array is the mark ID; the value is the owning element.
SkPDFArray markToTagArray;
for (SkPDFTagNode* mark : pageMarks) {
SkASSERT(mark->fRef);
markToTagArray.appendRef(mark->fRef);
}
parentTreeNums->appendInt(j);
parentTreeNums->appendRef(doc->emit(markToTagArray));
}
// Then, one entry per annotation.
for (size_t j = 0; j < fParentTreeAnnotationNodeIds.size(); ++j) {
int nodeId = fParentTreeAnnotationNodeIds[j];
// Same key that createStructParentKeyForNodeId returned for this entry.
int structParentKey = kFirstAnnotationStructParentKey + static_cast<int>(j);
SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
if (!tagPtr) {
continue;
}
SkPDFTagNode* tag = *tagPtr;
parentTreeNums->appendInt(structParentKey);
parentTreeNums->appendRef(tag->fRef);
}
parentTree.insertObject("Nums", std::move(parentTreeNums));
structTreeRoot.insertRef("ParentTree", doc->emit(parentTree));
// Build the IDTree, a mapping from every unique ID string to
// a reference to its corresponding structure element node.
if (!fIdTreeEntries.empty()) {
// Sort by numeric ID; nodeIdToString zero-pads so this matches the
// lexicographic order of the emitted ID strings.
std::sort(fIdTreeEntries.begin(), fIdTreeEntries.end(),
[](const IDTreeEntry& a, const IDTreeEntry& b) {
return a.nodeId < b.nodeId;
});
SkPDFDict idTree;
SkPDFDict idTreeLeaf;
// "Limits" holds the lowest and highest ID strings in this leaf.
auto limits = SkPDFMakeArray();
SkString lowestNodeIdString = SkPDFTagNode::nodeIdToString(
fIdTreeEntries.begin()->nodeId);
limits->appendByteString(lowestNodeIdString);
SkString highestNodeIdString = SkPDFTagNode::nodeIdToString(
fIdTreeEntries.rbegin()->nodeId);
limits->appendByteString(highestNodeIdString);
idTreeLeaf.insertObject("Limits", std::move(limits));
// "Names" alternates ID string and element reference.
auto names = SkPDFMakeArray();
for (const IDTreeEntry& entry : fIdTreeEntries) {
SkString idString = SkPDFTagNode::nodeIdToString(entry.nodeId);
names->appendByteString(idString);
names->appendRef(entry.ref);
}
idTreeLeaf.insertObject("Names", std::move(names));
// All entries go into a single leaf referenced from the root's "Kids".
auto idTreeKids = SkPDFMakeArray();
idTreeKids->appendRef(doc->emit(idTreeLeaf));
idTree.insertObject("Kids", std::move(idTreeKids));
structTreeRoot.insertRef("IDTree", doc->emit(idTree));
}
return doc->emit(structTreeRoot, ref);
}
namespace {
struct OutlineEntry {
struct Content {
SkString fText;
Location fLocation;
void accumulate(Content const& child) {
fText += child.fText;
fLocation.accumulate(child.fLocation);
}
};
Content fContent;
int fHeaderLevel;
SkPDFIndirectReference fRef;
SkPDFIndirectReference fStructureRef;
std::vector<OutlineEntry> fChildren = {};
size_t fDescendentsEmitted = 0;
void emitDescendents(SkPDFDocument* const doc) {
fDescendentsEmitted = fChildren.size();
for (size_t i = 0; i < fChildren.size(); ++i) {
auto&& child = fChildren[i];
child.emitDescendents(doc);
fDescendentsEmitted += child.fDescendentsEmitted;
SkPDFDict entry;
entry.insertTextString("Title", child.fContent.fText);
auto destination = SkPDFMakeArray();
destination->appendRef(doc->getPage(child.fContent.fLocation.fPageIndex));
destination->appendName("XYZ");
destination->appendScalar(child.fContent.fLocation.fPoint.x());
destination->appendScalar(child.fContent.fLocation.fPoint.y());
destination->appendInt(0);
entry.insertObject("Dest", std::move(destination));
entry.insertRef("Parent", child.fRef);
if (child.fStructureRef) {
entry.insertRef("SE", child.fStructureRef);
}
if (0 < i) {
entry.insertRef("Prev", fChildren[i-1].fRef);
}
if (i < fChildren.size()-1) {
entry.insertRef("Next", fChildren[i+1].fRef);
}
if (!child.fChildren.empty()) {
entry.insertRef("First", child.fChildren.front().fRef);
entry.insertRef("Last", child.fChildren.back().fRef);
entry.insertInt("Count", child.fDescendentsEmitted);
}
doc->emit(entry, child.fRef);
}
}
};
OutlineEntry::Content create_outline_entry_content(SkPDFTagNode* const node) {
SkString text;
if (!node->fTitle.isEmpty()) {
text = node->fTitle;
} else if (!node->fAlt.isEmpty()) {
text = node->fAlt;
}
// The uppermost/leftmost point on the earliest page of this node's marks.
Location markPoint;
for (auto&& mark : node->fMarkedContent) {
markPoint.accumulate(mark.fLocation);
}
OutlineEntry::Content content{std::move(text), std::move(markPoint)};
// Accumulate children
SkSpan<SkPDFTagNode> children(node->fChildren, node->fChildCount);
for (auto&& child : children) {
if (can_discard(&child)) {
continue;
}
content.accumulate(create_outline_entry_content(&child));
}
return content;
}
void create_outline_from_headers(SkPDFDocument* const doc, SkPDFTagNode* const node,
STArray<7, OutlineEntry*>& stack) {
char const *type = node->fTypeString.c_str();
if (type[0] == 'H' && '1' <= type[1] && type[1] <= '6') {
int level = type[1] - '0';
while (level <= stack.back()->fHeaderLevel) {
stack.pop_back();
}
OutlineEntry::Content content = create_outline_entry_content(node);
if (!content.fText.isEmpty()) {
OutlineEntry e{std::move(content), level, doc->reserveRef(), node->fRef};
stack.push_back(&stack.back()->fChildren.emplace_back(std::move(e)));
return;
}
}
SkSpan<SkPDFTagNode> children(node->fChildren, node->fChildCount);
for (auto&& child : children) {
if (can_discard(&child)) {
continue;
}
create_outline_from_headers(doc, &child, stack);
}
}
} // namespace
SkPDFIndirectReference SkPDFTagTree::makeOutline(SkPDFDocument* doc) {
if (!fRoot || can_discard(fRoot) ||
fOutline != SkPDF::Metadata::Outline::StructureElementHeaders)
{
return SkPDFIndirectReference();
}
STArray<7, OutlineEntry*> stack;
OutlineEntry top{{SkString(), Location()}, 0, {}, {}};
stack.push_back(&top);
create_outline_from_headers(doc, fRoot, stack);
if (top.fChildren.empty()) {
return SkPDFIndirectReference();
}
top.emitDescendents(doc);
SkPDFIndirectReference outlineRef = doc->reserveRef();
SkPDFDict outline("Outlines");
outline.insertRef("First", top.fChildren.front().fRef);
outline.insertRef("Last", top.fChildren.back().fRef);
outline.insertInt("Count", top.fDescendentsEmitted);
return doc->emit(outline, outlineRef);
}
// Returns the language string of the root node, or an empty string when
// there is no tree.
SkString SkPDFTagTree::getRootLanguage() {
    if (!fRoot) {
        return SkString();
    }
    return fRoot->fLang;
}