//========================================================================
//
// Parser.cc
//
// Copyright 1996-2003 Glyph & Cog, LLC
//
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2006, 2009, 2010 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
// Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#include <config.h>

#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif

#include <stddef.h>
#include "Object.h"
#include "Array.h"
#include "Dict.h"
#include "Decrypt.h"
#include "Parser.h"
#include "XRef.h"
#include "Error.h"

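// A Parser reads tokens from a Lexer with a two-token lookahead: buf1 holds
// the current token and buf2 the one after it, so that constructs such as
// "<int> <int> R" and "<< ... >> stream" can be recognized before their
// tokens are consumed.  The constructor primes both buffers from the lexer,
// and the Parser takes ownership of the lexer (it is deleted in ~Parser).
//
// Typical use (a sketch only, in the style of Gfx-like callers): wrap a
// stream in a Lexer, hand it to a Parser, and pull objects from it, e.g.
//
//   Parser parser(xref, new Lexer(xref, str), gFalse);
//   std::set<int> seen;
//   Object obj;
//   parser.getObj(&obj, &seen);
//   // ... use obj ...
//   obj.free();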
Parser::Parser(XRef *xrefA, Lexer *lexerA, GBool allowStreamsA) {
  xref = xrefA;
  lexer = lexerA;
  inlineImg = 0;
  allowStreams = allowStreamsA;
  lexer->getObj(&buf1);
  lexer->getObj(&buf2);
}

Parser::~Parser() {
  buf1.free();
  buf2.free();
  delete lexer;
}

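// The fetchOriginatorNums set is threaded through getObj() and into the
// Dict lookups it triggers; it carries the numbers of objects whose fetch is
// already in progress so that circular references (e.g. a stream /Length
// that leads back to the object being parsed) can be detected and broken
// instead of recursing forever.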
Object *Parser::getObj(Object *obj, Guchar *fileKey,
                       CryptAlgorithm encAlgorithm, int keyLength,
                       int objNum, int objGen) {
  std::set<int> fetchOriginatorNums;
  return getObj(obj, fileKey, encAlgorithm, keyLength, objNum, objGen, &fetchOriginatorNums);
}

Object *Parser::getObj(Object *obj, std::set<int> *fetchOriginatorNums)
{
  return getObj(obj, NULL, cryptRC4, 0, 0, 0, fetchOriginatorNums);
}

Object *Parser::getObj(Object *obj, Guchar *fileKey,
                       CryptAlgorithm encAlgorithm, int keyLength,
                       int objNum, int objGen, std::set<int> *fetchOriginatorNums) {
  char *key;
  Stream *str;
  Object obj2;
  int num;
  DecryptStream *decrypt;
  GooString *s, *s2;
  int c;

  // refill buffer after inline image data
  if (inlineImg == 2) {
    buf1.free();
    buf2.free();
    lexer->getObj(&buf1);
    lexer->getObj(&buf2);
    inlineImg = 0;
  }

  // array
  if (buf1.isCmd("[")) {
    shift();
    obj->initArray(xref);
    while (!buf1.isCmd("]") && !buf1.isEOF())
      obj->arrayAdd(getObj(&obj2, fileKey, encAlgorithm, keyLength,
                           objNum, objGen, fetchOriginatorNums));
    if (buf1.isEOF())
      error(getPos(), "End of file inside array");
    shift();

  // dictionary or stream
  } else if (buf1.isCmd("<<")) {
    shift(objNum);
    obj->initDict(xref);
    while (!buf1.isCmd(">>") && !buf1.isEOF()) {
      if (!buf1.isName()) {
        error(getPos(), "Dictionary key must be a name object");
        shift();
      } else {
        // buf1 might go away in shift(), so construct the key
        key = copyString(buf1.getName());
        shift();
        if (buf1.isEOF() || buf1.isError()) {
          gfree(key);
          break;
        }
        obj->dictAdd(key, getObj(&obj2, fileKey, encAlgorithm, keyLength, objNum, objGen, fetchOriginatorNums));
      }
    }
    if (buf1.isEOF())
      error(getPos(), "End of file inside dictionary");
    // stream objects are not allowed inside content streams or
    // object streams
    if (allowStreams && buf2.isCmd("stream")) {
      if ((str = makeStream(obj, fileKey, encAlgorithm, keyLength,
                            objNum, objGen, fetchOriginatorNums))) {
        obj->initStream(str);
      } else {
        obj->free();
        obj->initError();
      }
    } else {
      shift();
    }

  // indirect reference or integer
  } else if (buf1.isInt()) {
    num = buf1.getInt();
    shift();
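    // an integer followed by a second integer and the "R" keyword (now
    // sitting in buf1 and buf2) is an indirect reference; otherwise the
    // first integer stands on its own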
    if (buf1.isInt() && buf2.isCmd("R")) {
      obj->initRef(num, buf1.getInt());
      shift();
      shift();
    } else {
      obj->initInt(num);
    }

  // string
  } else if (buf1.isString() && fileKey) {
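    // decrypt the string: wrap the raw bytes in a MemStream, run them
    // through a DecryptStream keyed on this object's number and generation,
    // and collect the output into a new GooString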
    s = buf1.getString();
    s2 = new GooString();
    obj2.initNull();
    decrypt = new DecryptStream(new MemStream(s->getCString(), 0,
                                              s->getLength(), &obj2),
                                fileKey, encAlgorithm, keyLength,
                                objNum, objGen);
    decrypt->reset();
    while ((c = decrypt->getChar()) != EOF) {
      s2->append((char)c);
    }
    delete decrypt;
    obj->initString(s2);
    shift();

  // simple object
  } else {
    // avoid re-allocating memory for complex objects like strings by
    // shallow copy of <buf1> to <obj> and nulling <buf1> so that
    // subsequent buf1.free() won't free this memory
    buf1.shallowCopy(obj);
    buf1.initNull();
    shift();
  }

  return obj;
}

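// Build a Stream for a stream object whose dictionary has just been parsed
// into <dict>.  Reads /Length, skips over the stream data, and checks for
// the 'endstream' keyword; if /Length is wrong, it either rescans for
// 'endstream' (when an xref is available) or pads the length as a last
// resort.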
Stream *Parser::makeStream(Object *dict, Guchar *fileKey,
                           CryptAlgorithm encAlgorithm, int keyLength,
                           int objNum, int objGen, std::set<int> *fetchOriginatorNums) {
  Object obj;
  BaseStream *baseStr;
  Stream *str;
  Guint pos, endPos, length;

  // get stream start position
  lexer->skipToNextLine();
  pos = lexer->getPos();

  // get length
  dict->dictLookup("Length", &obj, fetchOriginatorNums);
  if (obj.isInt()) {
    length = (Guint)obj.getInt();
    obj.free();
  } else {
    error(getPos(), "Bad 'Length' attribute in stream");
    obj.free();
    length = 0;
  }

  // check for length in damaged file
  if (xref && xref->getStreamEnd(pos, &endPos)) {
    length = endPos - pos;
  }

  // in badly damaged PDF files, we can run off the end of the input
  // stream immediately after the "stream" token
  if (!lexer->getStream()) {
    return NULL;
  }
  baseStr = lexer->getStream()->getBaseStream();

  // skip over stream data
  if (Lexer::LOOK_VALUE_NOT_CACHED != lexer->lookCharLastValueCached) {
    // take into account the fact that we've cached one value
    pos = pos - 1;
    lexer->lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED;
  }
  lexer->setPos(pos + length);

  // refill token buffers and check for 'endstream'
  shift();  // kill '>>'
  shift();  // kill 'stream'
  if (buf1.isCmd("endstream")) {
    shift();
  } else {
    error(getPos(), "Missing 'endstream'");
    if (xref) {
      // shift until we find the proper endstream or we change to another object or reach eof
      while (!buf1.isCmd("endstream") && xref->getNumEntry(lexer->getPos()) == objNum && !buf1.isEOF()) {
        shift();
      }
      length = lexer->getPos() - pos;
      if (buf1.isCmd("endstream")) {
        obj.initInt(length);
        dict->dictSet("Length", &obj);
        obj.free();
      }
    } else {
      // When building the xref we can't use it, so use this
      // kludge for broken PDF files: just add 5k to the length, and
      // hope it's enough
      length += 5000;
    }
  }

  // make base stream
  str = baseStr->makeSubStream(pos, gTrue, length, dict);

  // handle decryption
  if (fileKey) {
    str = new DecryptStream(str, fileKey, encAlgorithm, keyLength,
                            objNum, objGen);
  }

  // get filters
  str = str->addFilters(dict);

  return str;
}

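// Advance the token window: buf2 moves into buf1 and a new token is read
// into buf2.  inlineImg tracks inline image parsing: it is set to 1 when the
// 'ID' operator reaches buf2 (the binary data that follows must not be
// tokenized, so buf2 is left null), bumped to 2 on the next shift, and reset
// in getObj() once the image data has been skipped.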
void Parser::shift(int objNum) {
  if (inlineImg > 0) {
    if (inlineImg < 2) {
      ++inlineImg;
    } else {
      // in a damaged content stream, if 'ID' shows up in the middle
      // of a dictionary, we need to reset
      inlineImg = 0;
    }
  } else if (buf2.isCmd("ID")) {
    lexer->skipChar();  // skip char after 'ID' command
    inlineImg = 1;
  }
  buf1.free();
  buf2.shallowCopy(&buf1);
  if (inlineImg > 0)    // don't buffer inline image data
    buf2.initNull();
  else
    lexer->getObj(&buf2, objNum);
}