You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1116 lines
34 KiB
1116 lines
34 KiB
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */ |
|
/* Copyright 2012 Mozilla Foundation |
|
* |
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
* you may not use this file except in compliance with the License. |
|
* You may obtain a copy of the License at |
|
* |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* |
|
* Unless required by applicable law or agreed to in writing, software |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* See the License for the specific language governing permissions and |
|
* limitations under the License. |
|
*/ |
|
/* globals assertWellFormed, bytesToString, CipherTransformFactory, error, info, |
|
InvalidPDFException, isArray, isCmd, isDict, isInt, isName, isRef, |
|
isStream, JpegStream, Lexer, log, Page, Parser, Promise, shadow, |
|
stringToPDFString, stringToUTF8String, warn, isString, assert, PDFJS, |
|
MissingDataException, XRefParseException, Stream */ |
|
|
|
'use strict'; |
|
|
|
/**
 * Name: a thin wrapper around a name value.
 * Instances expose the wrapped value through their `name` property.
 */
var Name = (function NameClosure() {
  // Named function expression so the constructor keeps the name `Name`.
  var ctor = function Name(name) {
    this.name = name;
  };

  // The prototype is replaced by an empty object literal on purpose.
  ctor.prototype = {};

  return ctor;
})();
|
|
|
/**
 * Cmd: wrapper around a command string, exposed as the `cmd` property.
 * Use `Cmd.get(cmd)` to obtain a shared, cached instance per command string.
 */
var Cmd = (function CmdClosure() {
  function Cmd(cmd) {
    this.cmd = cmd;
  }

  Cmd.prototype = {};

  // Prototype-less map: a plain `{}` would report inherited members such as
  // 'constructor' or 'toString' as cached commands and hand back a function
  // instead of a Cmd instance.
  var cmdCache = Object.create(null);

  /**
   * Returns the cached Cmd for `cmd`, creating and caching it on first use.
   * @param {string} cmd - command text used as the cache key.
   * @returns {Cmd} the single shared instance for that command text.
   */
  Cmd.get = function Cmd_get(cmd) {
    var cmdValue = cmdCache[cmd];
    if (cmdValue) {
      return cmdValue;
    }
    return (cmdCache[cmd] = new Cmd(cmd));
  };

  return Cmd;
})();
|
|
|
/**
 * Dict: a key/value dictionary with optional automatic dereferencing.
 * When an `xref` is attached, `get`, `getAll` and `forEach` pass values
 * through `xref.fetchIfRef`; `getRaw` never does.
 */
var Dict = (function DictClosure() {
  var nonSerializable = function nonSerializableClosure() {
    return nonSerializable; // creating closure on some variable
  };

  /**
   * @param xref - optional; object providing `fetchIfRef(value)`.
   */
  function Dict(xref) {
    // Map should only be used internally, use functions below to access.
    this.map = Object.create(null);
    this.xref = xref;
    this.__nonSerializable__ = nonSerializable; // disable cloning of the Dict
  }

  Dict.prototype = {
    /** Replaces the xref used for dereferencing. */
    assignXref: function Dict_assignXref(newXref) {
      this.xref = newXref;
    },

    /**
     * Looks up `key1`, falling back to `key2` and then `key3` when the
     * earlier key is absent. Automatically dereferences Ref objects when an
     * xref is attached.
     *
     * With a single key a missing entry yields `undefined` (same for two
     * keys when both are missing). When all three keys are given and none is
     * present the result is `null`.
     */
    get: function Dict_get(key1, key2, key3) {
      var value;
      var xref = this.xref;
      if (typeof (value = this.map[key1]) != 'undefined' || key1 in this.map ||
          typeof key2 == 'undefined') {
        return xref ? xref.fetchIfRef(value) : value;
      }
      if (typeof (value = this.map[key2]) != 'undefined' || key2 in this.map ||
          typeof key3 == 'undefined') {
        return xref ? xref.fetchIfRef(value) : value;
      }
      // Only a missing value maps to null; falsy-but-present values such as
      // 0, false or '' must be returned unchanged, as for key1 and key2.
      value = this.map[key3];
      if (typeof value == 'undefined') {
        value = null;
      }
      return xref ? xref.fetchIfRef(value) : value;
    },

    /** Returns the stored value for `key` without dereferencing. */
    getRaw: function Dict_getRaw(key) {
      return this.map[key];
    },

    /**
     * Creates a new plain object with every entry dereferenced; nested Dict
     * values are expanded recursively through their own `getAll`.
     */
    getAll: function Dict_getAll() {
      var all = {};
      for (var key in this.map) {
        var obj = this.get(key);
        all[key] = obj instanceof Dict ? obj.getAll() : obj;
      }
      return all;
    },

    /** Stores `value` under `key`. */
    set: function Dict_set(key, value) {
      this.map[key] = value;
    },

    /** Returns true when `key` is present, whatever its value. */
    has: function Dict_has(key) {
      return key in this.map;
    },

    /** Calls `callback(key, dereferencedValue)` for every entry. */
    forEach: function Dict_forEach(callback) {
      for (var key in this.map) {
        callback(key, this.get(key));
      }
    }
  };

  return Dict;
})();
|
|
|
/**
 * Ref: an indirect reference, described by an object number (`num`) and a
 * generation number (`gen`).
 */
var Ref = (function RefClosure() {
  // Named function expression so the constructor keeps the name `Ref`.
  var ctor = function Ref(num, gen) {
    this.num = num;
    this.gen = gen;
  };

  // The prototype is replaced by an empty object literal on purpose.
  ctor.prototype = {};

  return ctor;
})();
|
|
|
// A set of references. Each reference is identified by its object number
// and generation, so only one instance per (num, gen) pair is kept.
var RefSet = (function RefSetClosure() {
  // Builds the lookup key for a reference, e.g. 'R12.0'.
  function keyOf(ref) {
    return 'R' + ref.num + '.' + ref.gen;
  }

  function RefSet() {
    this.dict = {};
  }

  RefSet.prototype = {
    // True when a reference with the same num/gen was stored earlier.
    has: function RefSet_has(ref) {
      var stored = this.dict[keyOf(ref)];
      return stored ? true : false;
    },

    // Stores the reference, replacing any previous one with the same key.
    put: function RefSet_put(ref) {
      var key = keyOf(ref);
      this.dict[key] = ref;
    }
  };

  return RefSet;
})();
|
|
|
/**
 * Catalog: accessors built on top of the document catalog dictionary
 * returned by `xref.getCatalogObj()`.
 *
 * Most getters compute their value once and then replace themselves on the
 * instance through `shadow(this, name, value)`.
 */
var Catalog = (function CatalogClosure() {
  /**
   * @param pdfManager - stored and forwarded to every created Page.
   * @param xref       - cross-reference object used to fetch objects.
   */
  function Catalog(pdfManager, xref) {
    this.pdfManager = pdfManager;
    this.xref = xref;
    this.catDict = xref.getCatalogObj();
    assertWellFormed(isDict(this.catDict),
      'catalog object is not a dictionary');

    // Stores state as we traverse the pages catalog so that we can resume
    // parsing if an exception is thrown
    this.traversePagesQueue = [{
      pagesDict: this.toplevelPagesDict,
      posInKids: 0
    }];
    this.pagePromises = [];
    this.currPageIndex = 0;
  }

  Catalog.prototype = {
    /**
     * The XML metadata as a string, `null` when the catalog has no Metadata
     * reference, or `undefined` when the stream is not Type=Metadata /
     * Subtype=XML or cannot be decoded.
     */
    get metadata() {
      var streamRef = this.catDict.getRaw('Metadata');
      if (!isRef(streamRef)) {
        return shadow(this, 'metadata', null);
      }

      var encryptMetadata = !this.xref.encrypt ? false :
        this.xref.encrypt.encryptMetadata;

      // The second argument suppresses decryption when metadata is not
      // encrypted.
      var stream = this.xref.fetch(streamRef, !encryptMetadata);
      var metadata;
      if (stream && isDict(stream.dict)) {
        var type = stream.dict.get('Type');
        var subtype = stream.dict.get('Subtype');

        if (isName(type) && isName(subtype) &&
            type.name === 'Metadata' && subtype.name === 'XML') {
          // XXX: This should examine the charset the XML document defines,
          // however since there are currently no real means to decode
          // arbitrary charsets, let's just hope that the author of the PDF
          // was reasonable enough to stick with the XML default charset,
          // which is UTF-8.
          try {
            metadata = stringToUTF8String(bytesToString(stream.getBytes()));
          } catch (e) {
            info('Skipping invalid metadata.');
          }
        }
      }

      return shadow(this, 'metadata', metadata);
    },

    /** The dictionary stored under the catalog's 'Pages' key. */
    get toplevelPagesDict() {
      var pagesObj = this.catDict.get('Pages');
      assertWellFormed(isDict(pagesObj), 'invalid top-level pages dictionary');
      // shadow the prototype getter
      return shadow(this, 'toplevelPagesDict', pagesObj);
    },

    /**
     * The outline as a tree of items
     * `{dest, title, color, count, bold, italic, items}`, or `null` when
     * there are no items.
     */
    get documentOutline() {
      var xref = this.xref;
      var obj = this.catDict.get('Outlines');
      var root = { items: [] };
      if (isDict(obj)) {
        obj = obj.getRaw('First');
        var processed = new RefSet();
        if (isRef(obj)) {
          var queue = [{obj: obj, parent: root}];
          // to avoid recursion keeping track of the items
          // in the processed dictionary
          processed.put(obj);
          while (queue.length > 0) {
            var i = queue.shift();
            var outlineDict = xref.fetchIfRef(i.obj);
            if (outlineDict === null) {
              continue;
            }
            if (!outlineDict.has('Title')) {
              error('Invalid outline item');
            }
            // An action ('A') takes precedence over a direct 'Dest' entry.
            var dest = outlineDict.get('A');
            if (dest) {
              dest = dest.get('D');
            } else if (outlineDict.has('Dest')) {
              dest = outlineDict.getRaw('Dest');
              if (isName(dest)) {
                dest = dest.name;
              }
            }
            var title = outlineDict.get('Title');
            var outlineItem = {
              dest: dest,
              title: stringToPDFString(title),
              color: outlineDict.get('C') || [0, 0, 0],
              count: outlineDict.get('Count'),
              bold: !!(outlineDict.get('F') & 2),
              italic: !!(outlineDict.get('F') & 1),
              items: []
            };
            i.parent.items.push(outlineItem);
            // 'First' points to the first child, 'Next' to the next sibling.
            obj = outlineDict.getRaw('First');
            if (isRef(obj) && !processed.has(obj)) {
              queue.push({obj: obj, parent: outlineItem});
              processed.put(obj);
            }
            obj = outlineDict.getRaw('Next');
            if (isRef(obj) && !processed.has(obj)) {
              queue.push({obj: obj, parent: i.parent});
              processed.put(obj);
            }
          }
        }
      }
      obj = root.items.length > 0 ? root.items : null;
      return shadow(this, 'documentOutline', obj);
    },

    /** The integer stored under 'Count' in the top-level pages dictionary. */
    get numPages() {
      var obj = this.toplevelPagesDict.get('Count');
      assertWellFormed(
        isInt(obj),
        'page count in top level pages object is not an integer'
      );
      // Shadow the prototype getter under its own name, so the value is
      // cached instead of being re-read on every access.
      return shadow(this, 'numPages', obj);
    },

    /**
     * Map from destination name to destination. Read from the 'Dests' name
     * tree of the 'Names' dictionary when present, otherwise from the
     * catalog-level 'Dests' dictionary.
     */
    get destinations() {
      function fetchDestination(dest) {
        return isDict(dest) ? dest.get('D') : dest;
      }

      var xref = this.xref;
      var dests = {}, nameTreeRef, nameDictionaryRef;
      var obj = this.catDict.get('Names');
      // Fall back to the catalog-level dictionary when 'Names' exists but
      // has no 'Dests' entry, instead of returning no destinations at all.
      if (obj && obj.has('Dests')) {
        nameTreeRef = obj.getRaw('Dests');
      } else if (this.catDict.has('Dests')) {
        nameDictionaryRef = this.catDict.get('Dests');
      }

      if (nameDictionaryRef) {
        // reading simple destination dictionary
        obj = nameDictionaryRef;
        obj.forEach(function catalogForEach(key, value) {
          if (!value) {
            return;
          }
          dests[key] = fetchDestination(value);
        });
      }
      if (nameTreeRef) {
        var nameTree = new NameTree(nameTreeRef, xref);
        var names = nameTree.getAll();
        for (var name in names) {
          if (!names.hasOwnProperty(name)) {
            continue;
          }
          dests[name] = fetchDestination(names[name]);
        }
      }
      return shadow(this, 'destinations', dests);
    },

    /**
     * Array of script strings collected from the 'JavaScript' name tree of
     * the 'Names' dictionary; empty when there is none.
     */
    get javaScript() {
      var xref = this.xref;
      var obj = this.catDict.get('Names');

      var javaScript = [];
      if (obj && obj.has('JavaScript')) {
        var nameTree = new NameTree(obj.getRaw('JavaScript'), xref);
        var names = nameTree.getAll();
        for (var name in names) {
          if (!names.hasOwnProperty(name)) {
            continue;
          }
          // We don't really use the JavaScript right now so this code is
          // defensive so we don't cause errors on document load.
          var jsDict = names[name];
          if (!isDict(jsDict)) {
            continue;
          }
          var type = jsDict.get('S');
          if (!isName(type) || type.name !== 'JavaScript') {
            continue;
          }
          var js = jsDict.get('JS');
          if (!isString(js) && !isStream(js)) {
            continue;
          }
          if (isStream(js)) {
            js = bytesToString(js.getBytes());
          }
          javaScript.push(stringToPDFString(js));
        }
      }
      return shadow(this, 'javaScript', javaScript);
    },

    /**
     * Returns the promise for page `pageIndex`, creating an unresolved one
     * if needed; `traversePages` resolves it.
     */
    getPage: function Catalog_getPage(pageIndex) {
      if (!(pageIndex in this.pagePromises)) {
        this.pagePromises[pageIndex] = new PDFJS.Promise();
      }
      return this.pagePromises[pageIndex];
    },

    // Traverses pages in DFS order so that pages are processed in increasing
    // order. Progress lives in `traversePagesQueue` / `currPageIndex`, so a
    // later call continues where an interrupted one stopped.
    traversePages: function Catalog_traversePages() {
      var queue = this.traversePagesQueue;
      while (queue.length) {
        var queueItem = queue[queue.length - 1];
        var pagesDict = queueItem.pagesDict;

        var kids = pagesDict.get('Kids');
        assert(isArray(kids), 'page dictionary kids object is not an array');
        if (queueItem.posInKids >= kids.length) {
          queue.pop();
          continue;
        }
        var kidRef = kids[queueItem.posInKids];
        assert(isRef(kidRef), 'page dictionary kid is not a reference');

        var kid = this.xref.fetch(kidRef);
        // A dictionary without 'Kids' is treated as a page even when its
        // type is not 'Page'.
        if (isDict(kid, 'Page') || (isDict(kid) && !kid.has('Kids'))) {
          var pageIndex = this.currPageIndex++;
          var page = new Page(this.pdfManager, this.xref, pageIndex, kid,
                              kidRef);
          if (!(pageIndex in this.pagePromises)) {
            this.pagePromises[pageIndex] = new PDFJS.Promise();
          }
          this.pagePromises[pageIndex].resolve(page);

        } else { // must be a child page dictionary
          assert(
            isDict(kid),
            'page dictionary kid reference points to wrong type of object'
          );

          queue.push({
            pagesDict: kid,
            posInKids: 0
          });
        }
        ++queueItem.posInKids;
      }
    }
  };

  return Catalog;
})();
|
|
|
/**
 * XRef: reads the cross-reference data of a document (classic tables and
 * cross-reference streams), records one entry per object number in
 * `entries`, and fetches (and caches) objects by reference.
 */
var XRef = (function XRefClosure() {
  /**
   * @param stream   - stream holding the document data.
   * @param password - handed to CipherTransformFactory when the trailer has
   *                   an 'Encrypt' entry.
   */
  function XRef(stream, password) {
    this.stream = stream;
    this.entries = [];
    this.xrefstms = {};
    // prepare the XRef cache
    this.cache = [];
    this.password = password;
  }

  XRef.prototype = {
    setStartXRef: function XRef_setStartXRef(startXRef) {
      // Store the starting positions of xref tables as we process them
      // so we can recover from missing data errors
      this.startXRefQueue = [startXRef];
    },

    /**
     * Reads the xref data (or, in recovery mode, indexes all objects by
     * scanning the file), then sets `trailer`, `encrypt` (when the trailer
     * has 'Encrypt') and `root`.
     */
    parse: function XRef_parse(recoveryMode) {
      var trailerDict;
      if (!recoveryMode) {
        trailerDict = this.readXRef();
      } else {
        warn('Indexing all PDF objects');
        trailerDict = this.indexObjects();
      }
      trailerDict.assignXref(this);
      this.trailer = trailerDict;
      var encrypt = trailerDict.get('Encrypt');
      if (encrypt) {
        var ids = trailerDict.get('ID');
        var fileId = (ids && ids.length) ? ids[0] : '';
        this.encrypt = new CipherTransformFactory(
          encrypt, fileId, this.password);
      }

      // get the root dictionary (catalog) object
      if (!(this.root = trailerDict.get('Root'))) {
        error('Invalid root reference');
      }
    },

    /**
     * Reads a classic xref table followed by its trailer dictionary and
     * returns that dictionary.
     */
    processXRefTable: function XRef_processXRefTable(parser) {
      if (!('tableState' in this)) {
        // Stores state of the table as we process it so we can resume
        // from middle of table in case of missing data error
        this.tableState = {
          entryNum: 0,
          streamPos: parser.lexer.stream.pos,
          parserBuf1: parser.buf1,
          parserBuf2: parser.buf2
        };
      }

      var obj = this.readXRefTable(parser);

      // Sanity check
      if (!isCmd(obj, 'trailer')) {
        error('Invalid XRef table: could not find trailer dictionary');
      }

      // Read trailer dictionary, e.g.
      // trailer
      //    << /Size 22
      //      /Root 20R
      //      /Info 10R
      //      /ID [ <81b14aafa313db63dbd6f981e49f94f4> ]
      //    >>
      // The parser goes through the entire stream << ... >> and provides
      // a getter interface for the key-value table
      var dict = parser.getObj();
      if (!isDict(dict)) {
        error('Invalid XRef table: could not parse trailer dictionary');
      }

      delete this.tableState;

      return dict;
    },

    /**
     * Reads the subsections of a classic xref table into `entries`,
     * resuming from `tableState`. Returns the object that ended the table
     * (the 'trailer' command).
     */
    readXRefTable: function XRef_readXRefTable(parser) {
      // Example of cross-reference table:
      // xref
      // 0 1                    <-- subsection header (first obj #, obj count)
      // 0000000000 65535 f     <-- actual object (offset, generation #, f/n)
      // 23 2                   <-- subsection header ... and so on ...
      // 0000025518 00002 n
      // 0000025635 00000 n
      // trailer
      // ...

      var stream = parser.lexer.stream;
      var tableState = this.tableState;
      stream.pos = tableState.streamPos;
      parser.buf1 = tableState.parserBuf1;
      parser.buf2 = tableState.parserBuf2;

      // Outer loop is over subsection headers
      var obj;

      while (true) {
        if (!('firstEntryNum' in tableState) || !('entryCount' in tableState)) {
          if (isCmd(obj = parser.getObj(), 'trailer')) {
            break;
          }
          tableState.firstEntryNum = obj;
          tableState.entryCount = parser.getObj();
        }

        var first = tableState.firstEntryNum;
        var count = tableState.entryCount;
        if (!isInt(first) || !isInt(count)) {
          error('Invalid XRef table: wrong types in subsection header');
        }

        // Inner loop is over objects themselves
        for (var i = tableState.entryNum; i < count; i++) {
          // Snapshot before reading, so the entry can be re-read on resume.
          tableState.streamPos = stream.pos;
          tableState.entryNum = i;
          tableState.parserBuf1 = parser.buf1;
          tableState.parserBuf2 = parser.buf2;

          var entry = {};
          entry.offset = parser.getObj();
          entry.gen = parser.getObj();
          var type = parser.getObj();

          if (isCmd(type, 'f')) {
            entry.free = true;
          } else if (isCmd(type, 'n')) {
            entry.uncompressed = true;
          }

          // Validate entry obj
          if (!isInt(entry.offset) || !isInt(entry.gen) ||
              !(entry.free || entry.uncompressed)) {
            error('Invalid entry in XRef subsection: ' + first + ', ' + count);
          }

          // Entries recorded earlier win over later ones.
          if (!this.entries[i + first]) {
            this.entries[i + first] = entry;
          }
        }

        tableState.entryNum = 0;
        tableState.streamPos = stream.pos;
        tableState.parserBuf1 = parser.buf1;
        tableState.parserBuf2 = parser.buf2;
        delete tableState.firstEntryNum;
        delete tableState.entryCount;
      }

      // Sanity check: as per spec, first object must be free
      if (this.entries[0] && !this.entries[0].free) {
        error('Invalid XRef table: unexpected first object');
      }

      return obj;
    },

    /**
     * Reads a cross-reference stream into `entries` and returns the
     * stream's parameters dictionary.
     */
    processXRefStream: function XRef_processXRefStream(stream) {
      if (!('streamState' in this)) {
        // Stores state of the stream as we process it so we can resume
        // from middle of stream in case of missing data error
        var streamParameters = stream.parameters;
        var byteWidths = streamParameters.get('W');
        var range = streamParameters.get('Index');
        if (!range) {
          range = [0, streamParameters.get('Size')];
        }

        this.streamState = {
          entryRanges: range,
          byteWidths: byteWidths,
          entryNum: 0,
          streamPos: stream.pos
        };
      }
      this.readXRefStream(stream);
      delete this.streamState;

      return stream.parameters;
    },

    /**
     * Decodes the binary records of a cross-reference stream, resuming from
     * `streamState`. Each record is (type, offset, generation) with the
     * byte widths given by `streamState.byteWidths`.
     */
    readXRefStream: function XRef_readXRefStream(stream) {
      var i, j;
      var streamState = this.streamState;
      stream.pos = streamState.streamPos;

      var byteWidths = streamState.byteWidths;
      var typeFieldWidth = byteWidths[0];
      var offsetFieldWidth = byteWidths[1];
      var generationFieldWidth = byteWidths[2];

      // Ranges are consumed pairwise: (first object number, entry count).
      var entryRanges = streamState.entryRanges;
      while (entryRanges.length > 0) {

        var first = entryRanges[0];
        var n = entryRanges[1];

        if (!isInt(first) || !isInt(n)) {
          error('Invalid XRef range fields: ' + first + ', ' + n);
        }

        if (!isInt(typeFieldWidth) || !isInt(offsetFieldWidth) ||
            !isInt(generationFieldWidth)) {
          error('Invalid XRef entry fields length: ' + first + ', ' + n);
        }
        for (i = streamState.entryNum; i < n; ++i) {
          streamState.entryNum = i;
          streamState.streamPos = stream.pos;

          var type = 0, offset = 0, generation = 0;
          for (j = 0; j < typeFieldWidth; ++j) {
            type = (type << 8) | stream.getByte();
          }
          // if type field is absent, its default value = 1
          if (typeFieldWidth === 0) {
            type = 1;
          }
          for (j = 0; j < offsetFieldWidth; ++j) {
            offset = (offset << 8) | stream.getByte();
          }
          for (j = 0; j < generationFieldWidth; ++j) {
            generation = (generation << 8) | stream.getByte();
          }
          var entry = {};
          entry.offset = offset;
          entry.gen = generation;
          switch (type) {
            case 0:
              entry.free = true;
              break;
            case 1:
              entry.uncompressed = true;
              break;
            case 2:
              // compressed entry: neither flag is set
              break;
            default:
              error('Invalid XRef entry type: ' + type);
          }
          if (!this.entries[first + i]) {
            this.entries[first + i] = entry;
          }
        }

        streamState.entryNum = 0;
        streamState.streamPos = stream.pos;
        entryRanges.splice(0, 2);
      }
    },

    /**
     * Recovery path: scans the raw bytes for 'N G obj' headers, trailers and
     * XRef streams, rebuilds `entries`, and returns a trailer dictionary.
     * Throws InvalidPDFException when no trailer dictionary is found.
     */
    indexObjects: function XRef_indexObjects() {
      // Simple scan through the PDF content to find objects,
      // trailers and XRef streams.
      function readToken(data, offset) {
        var token = '', ch = data[offset];
        while (ch !== 13 && ch !== 10) {
          if (++offset >= data.length) {
            break;
          }
          token += String.fromCharCode(ch);
          ch = data[offset];
        }
        return token;
      }
      // Returns how many bytes precede the next occurrence of `what` at or
      // after `offset` (or the distance to the end of data if not found).
      function skipUntil(data, offset, what) {
        var length = what.length, dataLength = data.length;
        var skipped = 0;
        // finding byte sequence
        while (offset < dataLength) {
          var i = 0;
          while (i < length && data[offset + i] == what[i]) {
            ++i;
          }
          if (i >= length) {
            break; // sequence found
          }

          offset++;
          skipped++;
        }
        return skipped;
      }
      // 'trailer', 'startxref', 'endobj' and '/XRef' as byte values.
      var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
      var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114,
                                           101, 102]);
      var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]);
      var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);

      var stream = this.stream;
      stream.pos = 0;
      var buffer = stream.getBytes();
      var position = stream.start, length = buffer.length;
      var trailers = [], xrefStms = [];
      var state = 0;
      var currentToken;
      while (position < length) {
        var ch = buffer[position];
        // skip space, tab, CR, LF
        if (ch === 32 || ch === 9 || ch === 13 || ch === 10) {
          ++position;
          continue;
        }
        if (ch === 37) { // %-comment
          do {
            ++position;
            // A comment may run to the end of the data without a line
            // break; stop there instead of looping forever.
            if (position >= length) {
              break;
            }
            ch = buffer[position];
          } while (ch !== 13 && ch !== 10);
          continue;
        }
        var token = readToken(buffer, position);
        var m;
        if (token === 'xref') {
          position += skipUntil(buffer, position, trailerBytes);
          trailers.push(position);
          position += skipUntil(buffer, position, startxrefBytes);
        } else if ((m = /^(\d+)\s+(\d+)\s+obj\b/.exec(token))) {
          this.entries[m[1]] = {
            offset: position,
            gen: m[2] | 0,
            uncompressed: true
          };

          var contentLength = skipUntil(buffer, position, endobjBytes) + 7;
          var content = buffer.subarray(position, position + contentLength);

          // checking XRef stream suspect
          // (it shall have '/XRef' and next char is not a letter)
          var xrefTagOffset = skipUntil(content, 0, xrefBytes);
          if (xrefTagOffset < contentLength &&
              content[xrefTagOffset + 5] < 64) {
            xrefStms.push(position);
            this.xrefstms[position] = 1; // don't read it recursively
          }

          position += contentLength;
        } else {
          position += token.length + 1;
        }
      }
      // reading XRef streams
      for (var i = 0, ii = xrefStms.length; i < ii; ++i) {
        this.startXRefQueue.push(xrefStms[i]);
        this.readXRef(/* recoveryMode */ true);
      }
      // finding main trailer
      var dict;
      for (var i = 0, ii = trailers.length; i < ii; ++i) {
        stream.pos = trailers[i];
        var parser = new Parser(new Lexer(stream), true, null);
        var obj = parser.getObj();
        if (!isCmd(obj, 'trailer')) {
          continue;
        }
        // read the trailer dictionary
        if (!isDict(dict = parser.getObj())) {
          continue;
        }
        // taking the first one with 'ID'
        if (dict.has('ID')) {
          return dict;
        }
      }
      // no trailer with 'ID', taking last one (if exists)
      if (dict) {
        return dict;
      }
      // nothing helps
      // calling error() would reject worker with an UnknownErrorException.
      throw new InvalidPDFException('Invalid PDF structure');
    },

    /**
     * Processes every position in `startXRefQueue` (following 'XRefStm' and
     * 'Prev' links) and returns the first trailer/parameters dictionary
     * read (`topDict`).
     *
     * MissingDataException is rethrown; any other failure is logged and then
     * either swallowed (recovery mode, returns undefined) or reported as an
     * XRefParseException.
     */
    readXRef: function XRef_readXRef(recoveryMode) {
      var stream = this.stream;

      try {
        while (this.startXRefQueue.length) {
          var startXRef = this.startXRefQueue[0];

          stream.pos = startXRef;

          var parser = new Parser(new Lexer(stream), true, null);
          var obj = parser.getObj();
          var dict;

          // Get dictionary
          if (isCmd(obj, 'xref')) {

            // Parse end-of-file XRef
            dict = this.processXRefTable(parser);
            if (!this.topDict) {
              this.topDict = dict;
            }

            // Recursively get other XRefs 'XRefStm', if any
            obj = dict.get('XRefStm');
            if (isInt(obj)) {
              var pos = obj;
              // ignore previously loaded xref streams
              // (possible infinite recursion)
              if (!(pos in this.xrefstms)) {
                this.xrefstms[pos] = 1;
                this.startXRefQueue.push(pos);
              }
            }
          } else if (isInt(obj)) {

            // Parse in-stream XRef
            if (!isInt(parser.getObj()) ||
                !isCmd(parser.getObj(), 'obj') ||
                !isStream(obj = parser.getObj())) {
              error('Invalid XRef stream');
            }
            dict = this.processXRefStream(obj);
            if (!this.topDict) {
              this.topDict = dict;
            }

            if (!dict) {
              error('Failed to read XRef stream');
            }
          } else {
            // Neither an xref table nor an xref stream header. Without this
            // branch `dict` (function-scoped) would still hold the previous
            // iteration's dictionary and its 'Prev' would be queued again,
            // possibly forever.
            error('Invalid XRef stream header');
          }

          // Recursively get previous dictionary, if any
          obj = dict.get('Prev');
          if (isInt(obj)) {
            this.startXRefQueue.push(obj);
          } else if (isRef(obj)) {
            // The spec says Prev must not be a reference, i.e. "/Prev NNN"
            // This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R"
            this.startXRefQueue.push(obj.num);
          }

          this.startXRefQueue.shift();
        }

        return this.topDict;
      } catch (e) {
        if (e instanceof MissingDataException) {
          throw e;
        }

        log('(while reading XRef): ' + e);
      }

      if (recoveryMode) {
        return;
      }
      throw new XRefParseException();
    },

    /**
     * Returns the entry for object number `i`, or null when the entry is
     * missing, free, or has a zero/absent offset.
     */
    getEntry: function XRef_getEntry(i) {
      var e = this.entries[i];
      // `entries` is sparse: an unknown object number yields undefined, which
      // must be reported as "no entry" rather than dereferenced.
      if (!e) {
        return null;
      }
      return e.free || !e.offset ? null : e; // returns null if entry is free
    },

    /** Fetches `obj` when it is a reference, otherwise returns it as is. */
    fetchIfRef: function XRef_fetchIfRef(obj) {
      if (!isRef(obj)) {
        return obj;
      }
      return this.fetch(obj);
    },

    /**
     * Returns the object a reference points to, using `cache` when possible.
     * @param ref                - the reference to resolve.
     * @param suppressEncryption - when truthy the object is parsed without a
     *                             cipher transform even if `encrypt` is set.
     * @returns the parsed object, or null when the entry is free or missing.
     */
    fetch: function XRef_fetch(ref, suppressEncryption) {
      assertWellFormed(isRef(ref), 'ref object is not a reference');
      var num = ref.num;
      var e;
      if (num in this.cache) {
        e = this.cache[num];
        if (e instanceof Stream) {
          // hand out a fresh sub-stream rather than the cached instance
          return e.makeSubStream(e.start, e.length, e.dict);
        }
        return e;
      }

      e = this.getEntry(num);

      // the referenced entry can be free
      if (e === null) {
        return (this.cache[num] = e);
      }

      var gen = ref.gen;
      var stream, parser;
      if (e.uncompressed) {
        if (e.gen != gen) {
          error('inconsistent generation in XRef');
        }
        stream = this.stream.makeSubStream(e.offset);
        parser = new Parser(new Lexer(stream), true, this);
        // expected header: <num> <gen> obj
        var obj1 = parser.getObj();
        var obj2 = parser.getObj();
        var obj3 = parser.getObj();
        if (!isInt(obj1) || obj1 != num ||
            !isInt(obj2) || obj2 != gen ||
            !isCmd(obj3)) {
          error('bad XRef entry');
        }
        if (!isCmd(obj3, 'obj')) {
          // some bad pdfs use "obj1234" and really mean 1234
          if (obj3.cmd.indexOf('obj') === 0) {
            num = parseInt(obj3.cmd.substring(3), 10);
            if (!isNaN(num)) {
              return num;
            }
          }
          error('bad XRef entry');
        }
        if (this.encrypt && !suppressEncryption) {
          try {
            e = parser.getObj(this.encrypt.createCipherTransform(num, gen));
          } catch (ex) {
            // almost all streams must be encrypted, but sometimes
            // they are not probably due to some broken generators
            // re-trying without encryption
            return this.fetch(ref, true);
          }
        } else {
          e = parser.getObj();
        }
        if (!isStream(e) || e instanceof JpegStream) {
          this.cache[num] = e;
        } else if (e instanceof Stream) {
          e = e.makeSubStream(e.start, e.length, e.dict);
          this.cache[num] = e;
        } else if ('readBlock' in e) {
          // read all bytes first, then cache a sub-stream of that buffer
          e.getBytes();
          e = e.makeSubStream(0, e.bufferLength, e.dict);
          this.cache[num] = e;
        }
        return e;
      }

      // compressed entry: `offset` is the object number of the containing
      // object stream and `gen` is the index inside it.
      var tableOffset = e.offset;
      stream = this.fetch(new Ref(tableOffset, 0));
      if (!isStream(stream)) {
        error('bad ObjStm stream');
      }
      var first = stream.parameters.get('First');
      var n = stream.parameters.get('N');
      if (!isInt(first) || !isInt(n)) {
        error('invalid first and n parameters for ObjStm stream');
      }
      parser = new Parser(new Lexer(stream), false, this);
      parser.allowStreams = true;
      var i, entries = [], nums = [];
      // read the object numbers to populate cache
      for (i = 0; i < n; ++i) {
        num = parser.getObj();
        if (!isInt(num)) {
          error('invalid object number in the ObjStm stream: ' + num);
        }
        nums.push(num);
        var offset = parser.getObj();
        if (!isInt(offset)) {
          error('invalid object offset in the ObjStm stream: ' + offset);
        }
      }
      // read stream objects for cache
      for (i = 0; i < n; ++i) {
        entries.push(parser.getObj());
        num = nums[i];
        var entry = this.entries[num];
        if (entry && entry.offset === tableOffset && entry.gen === i) {
          this.cache[num] = entries[i];
        }
      }
      e = entries[e.gen];
      if (!e) {
        error('bad XRef entry for compressed object');
      }
      return e;
    },

    /** Returns the root (catalog) object found by `parse`. */
    getCatalogObj: function XRef_getCatalogObj() {
      return this.root;
    }
  };

  return XRef;
})();
|
|
|
/**
 * A NameTree is like a Dict but has some advantageous properties, see the
 * spec (7.9.6) for more details.
 * TODO: implement all the Dict functions and make this more efficient.
 */
var NameTree = (function NameTreeClosure() {
  /**
   * @param root - root node of the tree (or a reference to it); may be
   *               falsy, in which case the tree is empty.
   * @param xref - object providing `fetchIfRef`, used to resolve nodes and
   *               values.
   */
  function NameTree(root, xref) {
    this.root = root;
    this.xref = xref;
  }

  NameTree.prototype = {
    /**
     * Walks the tree breadth-first and returns a plain object mapping every
     * name found in the leaf 'Names' arrays to its dereferenced value.
     */
    getAll: function NameTree_getAll() {
      var result = {};
      var rootNode = this.root;
      if (!rootNode) {
        return result;
      }

      var xref = this.xref;
      // Nodes already queued, so that a kid listed twice is reported.
      var visited = new RefSet();
      visited.put(rootNode);

      var pending = [rootNode];
      while (pending.length > 0) {
        var node = xref.fetchIfRef(pending.shift());
        if (!isDict(node)) {
          continue;
        }

        // Intermediate node: only its kids are queued, 'Names' is ignored.
        if (node.has('Kids')) {
          var kids = node.get('Kids');
          for (var k = 0, kidCount = kids.length; k < kidCount; k++) {
            var kid = kids[k];
            if (visited.has(kid)) {
              error('invalid destinations');
            }
            pending.push(kid);
            visited.put(kid);
          }
          continue;
        }

        // Leaf node: 'Names' is a flat [key, value, key, value, ...] array.
        var names = node.get('Names');
        if (!names) {
          continue;
        }
        for (var p = 0, total = names.length; p < total; p += 2) {
          result[names[p]] = xref.fetchIfRef(names[p + 1]);
        }
      }
      return result;
    }
  };
  return NameTree;
})();
|
|
|
/**
 * A PDF document and page is built of many objects. E.g. there are objects
 * for fonts, images, rendering code and such. These objects might get
 * processed inside of a worker. The `PDFObjects` implements some basic
 * functions to manage these objects.
 */
var PDFObjects = (function PDFObjectsClosure() {
  function PDFObjects() {
    // Prototype-less map: with a plain `{}` an id such as 'constructor' or
    // 'toString' would be found on Object.prototype and mistaken for a
    // stored object.
    this.objs = Object.create(null);
  }

  PDFObjects.prototype = {
    /**
     * Internal function.
     * Ensures there is an object defined for `objId`. Stores `data` on the
     * object *if* it is created.
     */
    ensureObj: function PDFObjects_ensureObj(objId, data) {
      if (this.objs[objId]) {
        return this.objs[objId];
      }
      return (this.objs[objId] = new Promise(objId, data));
    },

    /**
     * If called *without* callback, this returns the data of `objId` but the
     * object needs to be resolved. If it isn't, this function throws.
     *
     * If called *with* a callback, the callback is called with the data of
     * the object once the object is resolved. That means, if you call this
     * function and the object is already resolved, the callback gets called
     * right away.
     */
    get: function PDFObjects_get(objId, callback) {
      // If there is a callback, then the get can be async and the object is
      // not required to be resolved right now
      if (callback) {
        this.ensureObj(objId).then(callback);
        return null;
      }

      // If there isn't a callback, the user expects to get the resolved data
      // directly.
      var obj = this.objs[objId];

      // If there isn't an object yet or the object isn't resolved, then the
      // data isn't ready yet!
      if (!obj || !obj.isResolved) {
        error('Requesting object that isn\'t resolved yet ' + objId);
      }

      return obj.data;
    },

    /**
     * Resolves the object `objId` with optional `data`.
     */
    resolve: function PDFObjects_resolve(objId, data) {
      var objs = this.objs;

      // In case there is a promise already on this object, just resolve it.
      if (objs[objId]) {
        objs[objId].resolve(data);
      } else {
        this.ensureObj(objId, data);
      }
    },

    /**
     * Registers `callback` through the object's `onData`, creating the
     * object if needed.
     */
    onData: function PDFObjects_onData(objId, callback) {
      this.ensureObj(objId).onData(callback);
    },

    /**
     * Returns false when `objId` is unknown, otherwise the object's
     * `isResolved` value.
     */
    isResolved: function PDFObjects_isResolved(objId) {
      var objs = this.objs;
      if (!objs[objId]) {
        return false;
      } else {
        return objs[objId].isResolved;
      }
    },

    /**
     * Returns false when `objId` is unknown, otherwise the object's
     * `hasData` value.
     */
    hasData: function PDFObjects_hasData(objId) {
      var objs = this.objs;
      if (!objs[objId]) {
        return false;
      } else {
        return objs[objId].hasData;
      }
    },

    /**
     * Returns the data of `objId` if object exists, null otherwise.
     */
    getData: function PDFObjects_getData(objId) {
      var objs = this.objs;
      if (!objs[objId] || !objs[objId].hasData) {
        return null;
      } else {
        return objs[objId].data;
      }
    },

    /**
     * Sets the data of an object but *doesn't* resolve it.
     */
    setData: function PDFObjects_setData(objId, data) {
      // Watchout! If you call `this.ensureObj(objId, data)` you're going to
      // create a *resolved* promise which shouldn't be the case!
      this.ensureObj(objId).data = data;
    },

    /** Drops every stored object. */
    clear: function PDFObjects_clear() {
      this.objs = Object.create(null);
    }
  };
  return PDFObjects;
})();
|
|
|
|