utils/pdftohtml.cc - third_party/poppler - Git at Google

 //========================================================================
 //
 // pdftohtml.cc
 //
 //
 // Copyright 1999-2000 G. Ovtcharov
 //========================================================================

 //========================================================================
 //
 // Modified under the Poppler project - http://poppler.freedesktop.org
 //
 // All changes made under the Poppler project to this file are licensed
 // under GPL version 2 or later
 //
 // Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022 Albert Astals Cid <aacid@kde.org>
 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
 // Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
 // Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
 // Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk>
 // Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
 // Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
 // Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com>
 // Copyright (C) 2014 Pino Toscano <pino@kde.org>
 // Copyright (C) 2015 William Bader <williambader@hotmail.com>
 // Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
 // Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
 // Copyright (C) 2019, 2021, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
 // Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net>
 // Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
 //
 //========================================================================

 #include "config.h"
 #include <poppler-config.h>
 #include <cstdio>
 #include <cstdlib>
 #include <cstddef>
 #include <cstring>
 #ifdef HAVE_DIRENT_H
 #    include <dirent.h>
 #endif
 #include <ctime>
 #include "parseargs.h"
 #include "goo/GooString.h"
 #include "goo/gbase64.h"
 #include "goo/gbasename.h"
 #include "goo/gmem.h"
 #include "Object.h"
 #include "Stream.h"
 #include "Array.h"
 #include "Dict.h"
 #include "XRef.h"
 #include "Catalog.h"
 #include "Page.h"
 #include "Outline.h"
 #include "PDFDoc.h"
 #include "PDFDocFactory.h"
 #include "HtmlOutputDev.h"
 #include "SplashOutputDev.h"
 #include "splash/SplashBitmap.h"
 #include "GlobalParams.h"
 #include "PDFDocEncoding.h"
 #include "Error.h"
 #include "DateInfo.h"
 #include "goo/gfile.h"
 #include "Win32Console.h"
 #include "InMemoryFile.h"
 #include "UTF.h"

 // Option state. The addresses of these variables are stored in the argDesc
 // table below and handed to parseArgs() from main().
 // Variables declared without `static` have external linkage; presumably other
 // translation units of the tool read them -- verify before changing linkage.
 static int firstPage = 1; // -f: first page to convert
 static int lastPage = 0; // -l: last page to convert; main() replaces values < 1 with the page count
 static bool rawOrder = true; // no live option (-raw is commented out in argDesc); main() derives it from complexMode/singleHtml
 bool printCommands = true; // cleared by main() when -q is given
 static bool printHelp = false; // -h, -?, -help, --help: print usage information
 bool printHtml = false; // -p: exchange .pdf links by .html
 bool complexMode = false; // -c: generate complex document
 bool singleHtml = false; // -s: generate single document that includes all pages
 bool dataUrls = false; // -dataurls: only offered when HAVE_IN_MEMORY_FILE is defined
 bool ignore = false; // -i: ignore images
 static char extension[5] = "png"; // -fmt: main() selects JPEG for "jpg" and PNG for anything else
 static double scale = 1.5; // -zoom: main() clamps it to the range [0.5, 3.0]
 bool noframes = false; // -noframes: generate no frames
 bool stout = false; // -stdout: use standard output
 bool xml = false; // -xml: output for XML post-processing
 bool noRoundedCoordinates = false; // -noroundcoord: do not round coordinates (with XML output only)
 static bool errQuiet = false; // -q: don't print any messages or errors
 static bool noDrm = false; // -nodrm: override document DRM settings
 double wordBreakThreshold = 10; // 10%, below converted into a coefficient - 0.1

 bool showHidden = false; // -hidden: output hidden text
 bool noMerge = false; // -nomerge: do not merge paragraphs
 bool fontFullName = false; // -fontfullname: outputs font full name
 static char ownerPassword[33] = ""; // -opw: owner password (for encrypted files)
 static char userPassword[33] = ""; // -upw: user password (for encrypted files)
 static bool printVersion = false; // -v: print copyright and version info

 // Helpers defined after main(): read one entry of the document info
 // dictionary as an HTML-filtered string / as a formatted date string.
 static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key);
 static GooString *getInfoDate(Dict *infoDict, const char *key);

 static char textEncName[128] = ""; // -enc: output text encoding name

 // Command-line option table passed to parseArgs() and printUsage() in main().
 // Each entry gives the option name, its kind (argInt / argFlag / argFP /
 // argString), the variable that receives the value, the buffer size (used by
 // the argString entries, 0 otherwise) and the help text. The empty entry
 // terminates the table.
 static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
                                    { "-l", argInt, &lastPage, 0, "last page to convert" },
                                    /*{"-raw",    argFlag,     &rawOrder,      0,
                                      "keep strings in content stream order"},*/
                                    { "-q", argFlag, &errQuiet, 0, "don't print any messages or errors" },
                                    { "-h", argFlag, &printHelp, 0, "print usage information" },
                                    { "-?", argFlag, &printHelp, 0, "print usage information" },
                                    { "-help", argFlag, &printHelp, 0, "print usage information" },
                                    { "--help", argFlag, &printHelp, 0, "print usage information" },
                                    { "-p", argFlag, &printHtml, 0, "exchange .pdf links by .html" },
                                    { "-c", argFlag, &complexMode, 0, "generate complex document" },
                                    { "-s", argFlag, &singleHtml, 0, "generate single document that includes all pages" },
 // -dataurls is only compiled in when HAVE_IN_MEMORY_FILE is defined.
 #ifdef HAVE_IN_MEMORY_FILE
                                    { "-dataurls", argFlag, &dataUrls, 0, "use data URLs instead of external images in HTML" },
 #endif
                                    { "-i", argFlag, &ignore, 0, "ignore images" },
                                    { "-noframes", argFlag, &noframes, 0, "generate no frames" },
                                    { "-stdout", argFlag, &stout, 0, "use standard output" },
                                    { "-zoom", argFP, &scale, 0, "zoom the pdf document (default 1.5)" },
                                    { "-xml", argFlag, &xml, 0, "output for XML post-processing" },
                                    { "-noroundcoord", argFlag, &noRoundedCoordinates, 0, "do not round coordinates (with XML output only)" },
                                    { "-hidden", argFlag, &showHidden, 0, "output hidden text" },
                                    { "-nomerge", argFlag, &noMerge, 0, "do not merge paragraphs" },
                                    { "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
                                    { "-fmt", argString, extension, sizeof(extension), "image file format for Splash output (png or jpg)" },
                                    { "-v", argFlag, &printVersion, 0, "print copyright and version info" },
                                    { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
                                    { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
                                    { "-nodrm", argFlag, &noDrm, 0, "override document DRM settings" },
                                    { "-wbt", argFP, &wordBreakThreshold, 0, "word break threshold (default 10 percent)" },
                                    { "-fontfullname", argFlag, &fontFullName, 0, "outputs font full name" },
                                    {} };

 /**
  * SplashOutputDev subclass whose text-related callbacks are all overridden
  * with empty bodies.
  *
  * main() renders every page through this device to produce the per-page
  * background image; because the text hooks below do nothing, whatever the
  * base class would have done for them is skipped.
  */
 class SplashOutputDevNoText : public SplashOutputDev
 {
 public:
     /// Forwards every argument unchanged to the SplashOutputDev constructor.
     SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true)
         : SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA)
     {
     }

     /// Declared here, defined (defaulted) out of line below the class.
     ~SplashOutputDevNoText() override;

     // --- text hooks: intentionally empty -------------------------------
     void drawChar(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, const Unicode * /*u*/, int /*uLen*/) override { }
     void beginTextObject(GfxState * /*state*/) override { }
     void endTextObject(GfxState * /*state*/) override { }

     // --- Type 3 font hooks: always answer "false" / do nothing ----------
     bool interpretType3Chars() override { return false; }
     bool beginType3Char(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, const Unicode * /*u*/, int /*uLen*/) override { return false; }
     void endType3Char(GfxState * /*state*/) override { }
 };

 SplashOutputDevNoText::~SplashOutputDevNoText() = default;

 int main(int argc, char *argv[])
 {
     std::unique_ptr<PDFDoc> doc;
     GooString *fileName = nullptr;
     std::unique_ptr<GooString> docTitle;
     std::unique_ptr<GooString> author;
     std::unique_ptr<GooString> keywords;
     std::unique_ptr<GooString> subject;
     GooString *date = nullptr;
     GooString *htmlFileName = nullptr;
     HtmlOutputDev *htmlOut = nullptr;
     SplashOutputDev *splashOut = nullptr;
     bool doOutline;
     bool ok;
     std::optional<GooString> ownerPW, userPW;
     Object info;
     int exit_status = EXIT_FAILURE;

     Win32Console win32Console(&argc, &argv);
     // parse args
     ok = parseArgs(argDesc, &argc, argv);
     if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
         fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
         fprintf(stderr, "%s\n", popplerCopyright);
         fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
         fprintf(stderr, "%s\n\n", xpdfCopyright);
         if (!printVersion) {
             printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
         }
         exit(printHelp || printVersion ? 0 : 1);
     }

     // init error file
     // errorInit();

     // read config file
     globalParams = std::make_unique<GlobalParams>();

     if (errQuiet) {
         globalParams->setErrQuiet(errQuiet);
         printCommands = false; // I'm not 100% what is the difference between them
     }

     if (textEncName[0]) {
         globalParams->setTextEncoding(textEncName);
         if (!globalParams->getTextEncoding()) {
             goto error;
         }
     }

     // convert from user-friendly percents into a coefficient
     wordBreakThreshold /= 100.0;

     // open PDF file
     if (ownerPassword[0]) {
         ownerPW = GooString(ownerPassword);
     }
     if (userPassword[0]) {
         userPW = GooString(userPassword);
     }

     fileName = new GooString(argv[1]);

     if (fileName->cmp("-") == 0) {
         delete fileName;
         fileName = new GooString("fd://0");
     }

     doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);

     if (!doc->isOk()) {
         goto error;
     }

     // check for copy permission
     if (!doc->okToCopy()) {
         if (!noDrm) {
             error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
             goto error;
         }
         fprintf(stderr, "Document has copy-protection bit set.\n");
     }

     // construct text file name
     if (argc == 3) {
         GooString *tmp = new GooString(argv[2]);
         if (!xml) {
             if (tmp->getLength() >= 5) {
                 const char *p = tmp->c_str() + tmp->getLength() - 5;
                 if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) {
                     htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 5);
                 }
             }
         } else {
             if (tmp->getLength() >= 4) {
                 const char *p = tmp->c_str() + tmp->getLength() - 4;
                 if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) {
                     htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 4);
                 }
             }
         }
         if (!htmlFileName) {
             htmlFileName = new GooString(tmp);
         }
         delete tmp;
     } else if (fileName->cmp("fd://0") == 0) {
         error(errCommandLine, -1, "You have to provide an output filename when reading from stdin.");
         goto error;
     } else {
         const char *p = fileName->c_str() + fileName->getLength() - 4;
         if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
             htmlFileName = new GooString(fileName->c_str(), fileName->getLength() - 4);
         } else {
             htmlFileName = fileName->copy();
         }
         //   htmlFileName->append(".html");
     }

     if (scale > 3.0) {
         scale = 3.0;
     }
     if (scale < 0.5) {
         scale = 0.5;
     }

     if (complexMode) {
         // noframes=false;
         stout = false;
     }

     if (stout) {
         noframes = true;
         complexMode = false;
     }

     if (xml) {
         complexMode = true;
         singleHtml = false;
         noframes = true;
         noMerge = true;
     }

     // get page range
     if (firstPage < 1) {
         firstPage = 1;
     }
     if (lastPage < 1 || lastPage > doc->getNumPages()) {
         lastPage = doc->getNumPages();
     }
     if (lastPage < firstPage) {
         error(errCommandLine, -1, "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage);
         goto error;
     }

     info = doc->getDocInfo();
     if (info.isDict()) {
         docTitle = getInfoString(info.getDict(), "Title");
         author = getInfoString(info.getDict(), "Author");
         keywords = getInfoString(info.getDict(), "Keywords");
         subject = getInfoString(info.getDict(), "Subject");
         date = getInfoDate(info.getDict(), "ModDate");
         if (!date) {
             date = getInfoDate(info.getDict(), "CreationDate");
         }
     }
     if (!docTitle) {
         docTitle = std::make_unique<GooString>(htmlFileName);
     }

     if (!singleHtml) {
         rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
     } else {
         rawOrder = singleHtml;
     }

     doOutline = doc->getOutline()->getItems() != nullptr;
     // write text file
     htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr,
                                 rawOrder, firstPage, doOutline);
     if (date) {
         delete date;
     }

     if ((complexMode || singleHtml) && !xml && !ignore) {
         // White paper color
         SplashColor color;
         color[0] = color[1] = color[2] = 255;
         // If the user specified "jpg" use JPEG, otherwise PNG
         SplashImageFileFormat format = strcmp(extension, "jpg") ? splashFormatPng : splashFormatJpeg;

         splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color);
         splashOut->startDoc(doc.get());

         for (int pg = firstPage; pg <= lastPage; ++pg) {
             InMemoryFile imf;
             doc->displayPage(splashOut, pg, 72 * scale, 72 * scale, 0, true, false, false);
             SplashBitmap *bitmap = splashOut->getBitmap();

             const std::string imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension);
             auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName.c_str(), "wb");
             if (!f1) {
                 fprintf(stderr, "Could not open %s\n", imgFileName.c_str());
                 continue;
             }
             bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale);
             fclose(f1);
             if (dataUrls) {
                 htmlOut->addBackgroundImage(std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(imf.getBuffer()));
             } else {
                 htmlOut->addBackgroundImage(gbasename(imgFileName.c_str()));
             }
         }

         delete splashOut;
     }

     if (htmlOut->isOk()) {
         doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, true, false, false);
         htmlOut->dumpDocOutline(doc.get());
     }

     delete htmlOut;

     exit_status = EXIT_SUCCESS;

     // clean up
 error:
     delete fileName;

     if (htmlFileName) {
         delete htmlFileName;
     }

     return exit_status;
 }

 /**
  * Reads a string entry from the document info dictionary and prepares it for
  * output.
  *
  * The raw value is decoded to Unicode -- as 16-bit big-endian units when it
  * starts with a Unicode byte order mark, through the pdfDocEncoding table
  * otherwise -- and then handed to HtmlFont::HtmlFilter.
  *
  * @param infoDict dictionary to query
  * @param key      name of the entry to look up
  * @return the result of HtmlFont::HtmlFilter, or an empty pointer when the
  *         entry is not a string
  */
 static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key)
 {
     Object entry = infoDict->lookup(key);
     if (!entry.isString()) {
         return {};
     }

     // Raw value as stored in the PDF.
     const GooString *raw = entry.getString();

     // With a byte order mark the two BOM bytes are skipped and every
     // remaining pair of bytes forms one code unit.
     const bool hasBom = hasUnicodeByteOrderMark(raw->toStr());
     const int count = hasBom ? (raw->getLength() - 2) / 2 : raw->getLength();

     // Scratch buffer for the decoded value; released automatically on return.
     std::unique_ptr<Unicode[]> decoded(new Unicode[count]);
     for (int idx = 0; idx < count; ++idx) {
         if (hasBom) {
             const int pos = (idx + 1) * 2; // byte offset of this unit, past the BOM
             const int high = raw->getChar(pos) & 0xff;
             const int low = raw->getChar(pos + 1) & 0xff;
             decoded[idx] = (high << 8) | low;
         } else {
             decoded[idx] = pdfDocEncoding[raw->getChar(idx) & 0xff];
         }
     }

     // HTML escape and encode unicode
     return HtmlFont::HtmlFilter(decoded.get(), count);
 }

 /**
  * Reads a date entry from the document info dictionary.
  *
  * When parseDateString() accepts the value it is re-formatted as
  * "YYYY-MM-DDTHH:MM:SS+00:00"; the parsed time zone fields are not used
  * (see the TODO below). Otherwise, or if strftime() produces nothing, a copy
  * of the raw string is returned.
  *
  * @param infoDict dictionary to query
  * @param key      name of the entry to look up
  * @return a newly allocated string the caller must delete, or nullptr when
  *         the entry is not a string
  */
 static GooString *getInfoDate(Dict *infoDict, const char *key)
 {
     Object entry = infoDict->lookup(key);
     if (!entry.isString()) {
         return nullptr;
     }

     const GooString *raw = entry.getString();

     int year, mon, day, hour, min, sec, tzHour, tzMinute;
     char tzSign;
     // TODO do something with the timezone info
     if (!parseDateString(raw, &year, &mon, &day, &hour, &min, &sec, &tzSign, &tzHour, &tzMinute)) {
         return new GooString(raw);
     }

     struct tm when;
     when.tm_year = year - 1900;
     when.tm_mon = mon - 1;
     when.tm_mday = day;
     when.tm_hour = hour;
     when.tm_min = min;
     when.tm_sec = sec;
     when.tm_wday = -1;
     when.tm_yday = -1;
     when.tm_isdst = -1;
     mktime(&when); // compute the tm_wday and tm_yday fields

     char text[256];
     if (strftime(text, sizeof(text), "%Y-%m-%dT%H:%M:%S+00:00", &when) == 0) {
         return new GooString(raw);
     }
     return new GooString(text);
 }
	//========================================================================
	//
	// pdftohtml.cc
	//
	//
	// Copyright 1999-2000 G. Ovtcharov
	//========================================================================

	//========================================================================
	//
	// Modified under the Poppler project - http://poppler.freedesktop.org
	//
	// All changes made under the Poppler project to this file are licensed
	// under GPL version 2 or later
	//
	// Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022 Albert Astals Cid <aacid@kde.org>
	// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
	// Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
	// Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
	// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
	// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk>
	// Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
	// Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
	// Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com>
	// Copyright (C) 2014 Pino Toscano <pino@kde.org>
	// Copyright (C) 2015 William Bader <williambader@hotmail.com>
	// Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
	// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
	// Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
	// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
	// Copyright (C) 2019, 2021, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
	// Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net>
	// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
	//
	// To see a description of the changes please see the Changelog file that
	// came with your tarball or type make ChangeLog if you are building from git
	//
	//========================================================================

	#include "config.h"
	#include <poppler-config.h>
	#include <cstdio>
	#include <cstdlib>
	#include <cstddef>
	#include <cstring>
	#ifdef HAVE_DIRENT_H
	# include <dirent.h>
	#endif
	#include <ctime>
	#include "parseargs.h"
	#include "goo/GooString.h"
	#include "goo/gbase64.h"
	#include "goo/gbasename.h"
	#include "goo/gmem.h"
	#include "Object.h"
	#include "Stream.h"
	#include "Array.h"
	#include "Dict.h"
	#include "XRef.h"
	#include "Catalog.h"
	#include "Page.h"
	#include "Outline.h"
	#include "PDFDoc.h"
	#include "PDFDocFactory.h"
	#include "HtmlOutputDev.h"
	#include "SplashOutputDev.h"
	#include "splash/SplashBitmap.h"
	#include "GlobalParams.h"
	#include "PDFDocEncoding.h"
	#include "Error.h"
	#include "DateInfo.h"
	#include "goo/gfile.h"
	#include "Win32Console.h"
	#include "InMemoryFile.h"
	#include "UTF.h"

	static int firstPage = 1;
	static int lastPage = 0;
	static bool rawOrder = true;
	bool printCommands = true;
	static bool printHelp = false;
	bool printHtml = false;
	bool complexMode = false;
	bool singleHtml = false; // singleHtml
	bool dataUrls = false;
	bool ignore = false;
	static char extension[5] = "png";
	static double scale = 1.5;
	bool noframes = false;
	bool stout = false;
	bool xml = false;
	bool noRoundedCoordinates = false;
	static bool errQuiet = false;
	static bool noDrm = false;
	double wordBreakThreshold = 10; // 10%, below converted into a coefficient - 0.1

	bool showHidden = false;
	bool noMerge = false;
	bool fontFullName = false;
	static char ownerPassword[33] = "";
	static char userPassword[33] = "";
	static bool printVersion = false;

	static std::unique_ptr<GooString> getInfoString(Dict infoDict, const char key);
	static GooString getInfoDate(Dict infoDict, const char *key);

	static char textEncName[128] = "";

	// Command-line option table passed to parseArgs() and printUsage() in main().
	// Each entry gives the option name, its kind (argInt / argFlag / argFP /
	// argString), the variable that receives the value, the buffer size (used by
	// the argString entries, 0 otherwise) and the help text. The empty entry
	// terminates the table.
	static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
	{ "-l", argInt, &lastPage, 0, "last page to convert" },
	/*{"-raw", argFlag, &rawOrder, 0,
	"keep strings in content stream order"},*/
	{ "-q", argFlag, &errQuiet, 0, "don't print any messages or errors" },
	{ "-h", argFlag, &printHelp, 0, "print usage information" },
	{ "-?", argFlag, &printHelp, 0, "print usage information" },
	{ "-help", argFlag, &printHelp, 0, "print usage information" },
	{ "--help", argFlag, &printHelp, 0, "print usage information" },
	{ "-p", argFlag, &printHtml, 0, "exchange .pdf links by .html" },
	{ "-c", argFlag, &complexMode, 0, "generate complex document" },
	{ "-s", argFlag, &singleHtml, 0, "generate single document that includes all pages" },
	// -dataurls is only compiled in when HAVE_IN_MEMORY_FILE is defined.
	#ifdef HAVE_IN_MEMORY_FILE
	{ "-dataurls", argFlag, &dataUrls, 0, "use data URLs instead of external images in HTML" },
	#endif
	{ "-i", argFlag, &ignore, 0, "ignore images" },
	{ "-noframes", argFlag, &noframes, 0, "generate no frames" },
	{ "-stdout", argFlag, &stout, 0, "use standard output" },
	{ "-zoom", argFP, &scale, 0, "zoom the pdf document (default 1.5)" },
	{ "-xml", argFlag, &xml, 0, "output for XML post-processing" },
	{ "-noroundcoord", argFlag, &noRoundedCoordinates, 0, "do not round coordinates (with XML output only)" },
	{ "-hidden", argFlag, &showHidden, 0, "output hidden text" },
	{ "-nomerge", argFlag, &noMerge, 0, "do not merge paragraphs" },
	{ "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
	{ "-fmt", argString, extension, sizeof(extension), "image file format for Splash output (png or jpg)" },
	{ "-v", argFlag, &printVersion, 0, "print copyright and version info" },
	{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
	{ "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
	{ "-nodrm", argFlag, &noDrm, 0, "override document DRM settings" },
	{ "-wbt", argFP, &wordBreakThreshold, 0, "word break threshold (default 10 percent)" },
	{ "-fontfullname", argFlag, &fontFullName, 0, "outputs font full name" },
	{} };

	class SplashOutputDevNoText : public SplashOutputDev
	{
	public:
	SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true)
	: SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA) { }
	~SplashOutputDevNoText() override;

	void drawChar(GfxState state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode u, int uLen) override { }
	bool beginType3Char(GfxState state, double x, double y, double dx, double dy, CharCode code, const Unicode u, int uLen) override { return false; }
	void endType3Char(GfxState *state) override { }
	void beginTextObject(GfxState *state) override { }
	void endTextObject(GfxState *state) override { }
	bool interpretType3Chars() override { return false; }
	};

	SplashOutputDevNoText::~SplashOutputDevNoText() = default;

	int main(int argc, char *argv[])
	{
	std::unique_ptr<PDFDoc> doc;
	GooString *fileName = nullptr;
	std::unique_ptr<GooString> docTitle;
	std::unique_ptr<GooString> author;
	std::unique_ptr<GooString> keywords;
	std::unique_ptr<GooString> subject;
	GooString *date = nullptr;
	GooString *htmlFileName = nullptr;
	HtmlOutputDev *htmlOut = nullptr;
	SplashOutputDev *splashOut = nullptr;
	bool doOutline;
	bool ok;
	std::optional<GooString> ownerPW, userPW;
	Object info;
	int exit_status = EXIT_FAILURE;

	Win32Console win32Console(&argc, &argv);
	// parse args
	ok = parseArgs(argDesc, &argc, argv);
	if (!ok \|\| argc < 2 \|\| argc > 3 \|\| printHelp \|\| printVersion) {
	fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
	fprintf(stderr, "%s\n", popplerCopyright);
	fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
	fprintf(stderr, "%s\n\n", xpdfCopyright);
	if (!printVersion) {
	printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
	}
	exit(printHelp \|\| printVersion ? 0 : 1);
	}

	// init error file
	// errorInit();

	// read config file
	globalParams = std::make_unique<GlobalParams>();

	if (errQuiet) {
	globalParams->setErrQuiet(errQuiet);
	printCommands = false; // I'm not 100% what is the difference between them
	}

	if (textEncName[0]) {
	globalParams->setTextEncoding(textEncName);
	if (!globalParams->getTextEncoding()) {
	goto error;
	}
	}

	// convert from user-friendly percents into a coefficient
	wordBreakThreshold /= 100.0;

	// open PDF file
	if (ownerPassword[0]) {
	ownerPW = GooString(ownerPassword);
	}
	if (userPassword[0]) {
	userPW = GooString(userPassword);
	}

	fileName = new GooString(argv[1]);

	if (fileName->cmp("-") == 0) {
	delete fileName;
	fileName = new GooString("fd://0");
	}

	doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);

	if (!doc->isOk()) {
	goto error;
	}

	// check for copy permission
	if (!doc->okToCopy()) {
	if (!noDrm) {
	error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
	goto error;
	}
	fprintf(stderr, "Document has copy-protection bit set.\n");
	}

	// construct text file name
	if (argc == 3) {
	GooString *tmp = new GooString(argv[2]);
	if (!xml) {
	if (tmp->getLength() >= 5) {
	const char *p = tmp->c_str() + tmp->getLength() - 5;
	if (!strcmp(p, ".html") \|\| !strcmp(p, ".HTML")) {
	htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 5);
	}
	}
	} else {
	if (tmp->getLength() >= 4) {
	const char *p = tmp->c_str() + tmp->getLength() - 4;
	if (!strcmp(p, ".xml") \|\| !strcmp(p, ".XML")) {
	htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 4);
	}
	}
	}
	if (!htmlFileName) {
	htmlFileName = new GooString(tmp);
	}
	delete tmp;
	} else if (fileName->cmp("fd://0") == 0) {
	error(errCommandLine, -1, "You have to provide an output filename when reading from stdin.");
	goto error;
	} else {
	const char *p = fileName->c_str() + fileName->getLength() - 4;
	if (!strcmp(p, ".pdf") \|\| !strcmp(p, ".PDF")) {
	htmlFileName = new GooString(fileName->c_str(), fileName->getLength() - 4);
	} else {
	htmlFileName = fileName->copy();
	}
	// htmlFileName->append(".html");
	}

	if (scale > 3.0) {
	scale = 3.0;
	}
	if (scale < 0.5) {
	scale = 0.5;
	}

	if (complexMode) {
	// noframes=false;
	stout = false;
	}

	if (stout) {
	noframes = true;
	complexMode = false;
	}

	if (xml) {
	complexMode = true;
	singleHtml = false;
	noframes = true;
	noMerge = true;
	}

	// get page range
	if (firstPage < 1) {
	firstPage = 1;
	}
	if (lastPage < 1 \|\| lastPage > doc->getNumPages()) {
	lastPage = doc->getNumPages();
	}
	if (lastPage < firstPage) {
	error(errCommandLine, -1, "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage);
	goto error;
	}

	info = doc->getDocInfo();
	if (info.isDict()) {
	docTitle = getInfoString(info.getDict(), "Title");
	author = getInfoString(info.getDict(), "Author");
	keywords = getInfoString(info.getDict(), "Keywords");
	subject = getInfoString(info.getDict(), "Subject");
	date = getInfoDate(info.getDict(), "ModDate");
	if (!date) {
	date = getInfoDate(info.getDict(), "CreationDate");
	}
	}
	if (!docTitle) {
	docTitle = std::make_unique<GooString>(htmlFileName);
	}

	if (!singleHtml) {
	rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
	} else {
	rawOrder = singleHtml;
	}

	doOutline = doc->getOutline()->getItems() != nullptr;
	// write text file
	htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr,
	rawOrder, firstPage, doOutline);
	if (date) {
	delete date;
	}

	if ((complexMode \|\| singleHtml) && !xml && !ignore) {
	// White paper color
	SplashColor color;
	color[0] = color[1] = color[2] = 255;
	// If the user specified "jpg" use JPEG, otherwise PNG
	SplashImageFileFormat format = strcmp(extension, "jpg") ? splashFormatPng : splashFormatJpeg;

	splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color);
	splashOut->startDoc(doc.get());

	for (int pg = firstPage; pg <= lastPage; ++pg) {
	InMemoryFile imf;
	doc->displayPage(splashOut, pg, 72 * scale, 72 * scale, 0, true, false, false);
	SplashBitmap *bitmap = splashOut->getBitmap();

	const std::string imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension);
	auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName.c_str(), "wb");
	if (!f1) {
	fprintf(stderr, "Could not open %s\n", imgFileName.c_str());
	continue;
	}
	bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale);
	fclose(f1);
	if (dataUrls) {
	htmlOut->addBackgroundImage(std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(imf.getBuffer()));
	} else {
	htmlOut->addBackgroundImage(gbasename(imgFileName.c_str()));
	}
	}

	delete splashOut;
	}

	if (htmlOut->isOk()) {
	doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, true, false, false);
	htmlOut->dumpDocOutline(doc.get());
	}

	delete htmlOut;

	exit_status = EXIT_SUCCESS;

	// clean up
	error:
	delete fileName;

	if (htmlFileName) {
	delete htmlFileName;
	}

	return exit_status;
	}

	static std::unique_ptr<GooString> getInfoString(Dict infoDict, const char key)
	{
	Object obj;
	// Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
	const GooString *rawString;
	// Value converted to unicode
	Unicode *unicodeString;
	int unicodeLength;
	// Value HTML escaped and converted to desired encoding
	std::unique_ptr<GooString> encodedString;
	// Is rawString UCS2 (as opposed to pdfDocEncoding)
	bool isUnicode;

	obj = infoDict->lookup(key);
	if (obj.isString()) {
	rawString = obj.getString();

	// Convert rawString to unicode
	if (hasUnicodeByteOrderMark(rawString->toStr())) {
	isUnicode = true;
	unicodeLength = (obj.getString()->getLength() - 2) / 2;
	} else {
	isUnicode = false;
	unicodeLength = obj.getString()->getLength();
	}
	unicodeString = new Unicode[unicodeLength];

	for (int i = 0; i < unicodeLength; i++) {
	if (isUnicode) {
	unicodeString[i] = ((rawString->getChar((i + 1) * 2) & 0xff) << 8) \| (rawString->getChar(((i + 1) * 2) + 1) & 0xff);
	} else {
	unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff];
	}
	}

	// HTML escape and encode unicode
	encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength);
	delete[] unicodeString;
	}

	return encodedString;
	}

	/**
	 * Reads a date entry from the document info dictionary.
	 *
	 * When parseDateString() accepts the value it is re-formatted as
	 * "YYYY-MM-DDTHH:MM:SS+00:00"; the parsed time zone fields are not used
	 * (see the TODO below). Otherwise, or if strftime() produces nothing, a copy
	 * of the raw string is returned.
	 *
	 * @param infoDict dictionary to query (pointer; the body uses infoDict->lookup)
	 * @param key      name of the entry to look up
	 * @return a newly allocated string the caller must delete, or nullptr when
	 *         the entry is not a string
	 */
	static GooString *getInfoDate(Dict *infoDict, const char *key)
	{
	    Object entry = infoDict->lookup(key);
	    if (!entry.isString()) {
	        return nullptr;
	    }

	    const GooString *raw = entry.getString();

	    int year, mon, day, hour, min, sec, tzHour, tzMinute;
	    char tzSign;
	    // TODO do something with the timezone info
	    if (!parseDateString(raw, &year, &mon, &day, &hour, &min, &sec, &tzSign, &tzHour, &tzMinute)) {
	        return new GooString(raw);
	    }

	    struct tm when;
	    when.tm_year = year - 1900;
	    when.tm_mon = mon - 1;
	    when.tm_mday = day;
	    when.tm_hour = hour;
	    when.tm_min = min;
	    when.tm_sec = sec;
	    when.tm_wday = -1;
	    when.tm_yday = -1;
	    when.tm_isdst = -1;
	    mktime(&when); // compute the tm_wday and tm_yday fields

	    char text[256];
	    if (strftime(text, sizeof(text), "%Y-%m-%dT%H:%M:%S+00:00", &when) == 0) {
	        return new GooString(raw);
	    }
	    return new GooString(text);
	}