diff --git a/examples/node/domparsermock.js b/examples/node/domparsermock.js
deleted file mode 100644
index 1dde248c3..000000000
--- a/examples/node/domparsermock.js
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Any copyright is dedicated to the Public Domain.
- * http://creativecommons.org/publicdomain/zero/1.0/ */
-
-// Dummy XML Parser
-
-function DOMNodeMock(nodeName, nodeValue) {
- this.nodeName = nodeName;
- this.nodeValue = nodeValue;
- Object.defineProperty(this, 'parentNode', {value: null, writable: true});
-}
-DOMNodeMock.prototype = {
- get firstChild() {
- return this.childNodes[0];
- },
- get nextSibling() {
- var index = this.parentNode.childNodes.indexOf(this);
- return this.parentNode.childNodes[index + 1];
- },
- get textContent() {
- if (!this.childNodes) {
- return this.nodeValue || '';
- }
- return this.childNodes.map(function (child) {
- return child.textContent;
- }).join('');
- },
- hasChildNodes: function () {
- return this.childNodes && this.childNodes.length > 0;
- }
-};
-
-function decodeXML(text) {
- if (text.indexOf('&') < 0) {
- return text;
- }
- return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, function (all, entityName, number) {
- if (number) {
- return String.fromCharCode(number[0] === 'x' ? parseInt(number.substring(1), 16) : +number);
- }
- switch (entityName) {
- case 'amp':
- return '&';
- case 'lt':
- return '<';
- case 'gt':
- return '>';
- case 'quot':
- return '\"';
- case 'apos':
- return '\'';
- }
- return '&' + entityName + ';';
- });
-}
-
-function DOMParserMock() {};
-DOMParserMock.prototype = {
- parseFromString: function (content) {
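- content = content.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
- var nodes = [];
- content = content.replace(/>([\s\S]+?)</g, function (all, text) {
- var i = nodes.length;
- var node = new DOMNodeMock('#text', decodeXML(text));
- nodes.push(node);
- if (node.textContent.trim().length === 0) {
- return '><'; // ignoring whitespaces
- }
- return '>' + i + ',<';
- });
- content = content.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, function (all, text) {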
- var i = nodes.length;
- var node = new DOMNodeMock('#text', text);
- nodes.push(node);
- return i + ',';
- });
- var lastLength;
- do {
- lastLength = nodes.length;
- content = content.replace(/<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g,
- function (all, name, attrs, content) {
- var i = nodes.length;
- var node = new DOMNodeMock(name);
- var children = [];
- if (content) {
- content = content.split(',');
- content.pop();
- content.forEach(function (child) {
- var childNode = nodes[+child];
- childNode.parentNode = node;
- children.push(childNode);
- })
- }
- node.childNodes = children;
- nodes.push(node);
- return i + ',';
-
- });
- } while(lastLength < nodes.length);
- return {
- documentElement: nodes.pop()
- };
- }
-};
-
-exports.DOMParserMock = DOMParserMock;
diff --git a/examples/node/getinfo.js b/examples/node/getinfo.js
index 3dce2e20a..61034cfa3 100644
--- a/examples/node/getinfo.js
+++ b/examples/node/getinfo.js
@@ -9,9 +9,6 @@
var fs = require('fs');
-// HACK adding DOMParser to read XMP metadata.
-global.DOMParser = require('./domparsermock.js').DOMParserMock;
-
// Run `gulp dist-install` to generate 'pdfjs-dist' npm package files.
var pdfjsLib = require('pdfjs-dist');
@@ -34,7 +31,7 @@ pdfjsLib.getDocument(pdfPath).then(function (doc) {
console.log();
if (data.metadata) {
console.log('## Metadata');
- console.log(JSON.stringify(data.metadata.metadata, null, 2));
+ console.log(JSON.stringify(data.metadata.getAll(), null, 2));
console.log();
}
});
diff --git a/src/display/dom_utils.js b/src/display/dom_utils.js
index 4d0116849..21cfb0425 100644
--- a/src/display/dom_utils.js
+++ b/src/display/dom_utils.js
@@ -131,6 +131,132 @@ class DOMSVGFactory {
}
}
+/**
+ * Minimal stand-in for a DOM node, used for the trees that `SimpleXMLParser`
+ * builds.
+ *
+ * The parser assigns a `childNodes` array to every element node; `#text`
+ * nodes never receive one. The accessors below rely on that difference and
+ * therefore have to cope with `childNodes` being undefined.
+ */
+class SimpleDOMNode {
+  /**
+   * @param {string} nodeName - Element name, or '#text' for a text node.
+   * @param {string} [nodeValue] - Text carried by the node, if any.
+   */
+  constructor(nodeName, nodeValue) {
+    this.nodeName = nodeName;
+    this.nodeValue = nodeValue;
+
+    // Defined without `enumerable`, so the back-reference to the parent does
+    // not show up when the node is enumerated or serialized.
+    Object.defineProperty(this, 'parentNode', { value: null, writable: true, });
+  }
+
+  /**
+   * @returns {SimpleDOMNode|undefined} The first child, or `undefined` when
+   *   the node has no children (including text nodes, which used to throw).
+   */
+  get firstChild() {
+    return this.childNodes ? this.childNodes[0] : undefined;
+  }
+
+  /**
+   * @returns {SimpleDOMNode|undefined} The node following this one in its
+   *   parent's `childNodes`, or `undefined` when there is none. A node
+   *   without a parent (e.g. the root) has no sibling; this used to throw.
+   */
+  get nextSibling() {
+    const siblings = this.parentNode && this.parentNode.childNodes;
+    if (!siblings) {
+      return undefined;
+    }
+    const index = siblings.indexOf(this);
+    // Without this check a node missing from its parent's list would yield
+    // `siblings[0]`, since `indexOf` returns -1.
+    if (index < 0) {
+      return undefined;
+    }
+    return siblings[index + 1];
+  }
+
+  /**
+   * @returns {string} The node's own value for nodes without `childNodes`,
+   *   otherwise the concatenated text of all descendants.
+   */
+  get textContent() {
+    if (!this.childNodes) {
+      return this.nodeValue || '';
+    }
+    return this.childNodes.map(function(child) {
+      return child.textContent;
+    }).join('');
+  }
+
+  /**
+   * @returns {boolean} Whether the node has at least one child. Always a real
+   *   boolean; text nodes used to produce `undefined`.
+   */
+  hasChildNodes() {
+    return !!this.childNodes && this.childNodes.length > 0;
+  }
+}
+
+/**
+ * Small XML parser based on regular expressions; it converts a string into a
+ * tree of `SimpleDOMNode` objects.
+ *
+ * Limitations that follow directly from the code below: attributes are
+ * matched but discarded, the name in a closing tag is not compared with the
+ * one in the opening tag, and only the five predefined XML entities plus
+ * numeric character references are decoded.
+ */
+class SimpleXMLParser {
+  /**
+   * Parses an XML string.
+   *
+   * @param {string} data - The XML source.
+   * @returns {{documentElement: (SimpleDOMNode|undefined)}} Wrapper around
+   *   the node that was created last, which is the root element when the
+   *   input has a single root; `undefined` when no node was created.
+   */
+  parseFromString(data) {
+    const nodes = [];
+
+    // Remove all processing instructions, comments and the DOCTYPE.
+    data = data.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
+    data = data.replace(/<!DOCTYPE[^>\[]+(\[[^\]]+)?[^>]+>/g, '').trim();
+
+    // Extract all text nodes and replace them with a numeric index in
+    // the nodes. (Arrow function, since `this._decodeXML` is needed.)
+    data = data.replace(/>([^<][\s\S]*?)</g, (all, text) => {
+      const index = nodes.length;
+      const node = new SimpleDOMNode('#text', this._decodeXML(text));
+      nodes.push(node);
+      if (node.textContent.trim().length === 0) {
+        return '><'; // Ignore whitespace.
+      }
+      return '>' + index + ',<';
+    });
+
+    // Extract all CDATA nodes; their content is stored as-is, not decoded.
+    data = data.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (all, text) => {
+      const index = nodes.length;
+      const node = new SimpleDOMNode('#text', text);
+      nodes.push(node);
+      return index + ',';
+    });
+
+    // Until nodes without '<' and '>' content are present, replace them
+    // with a numeric index in the nodes: each pass collapses the elements
+    // that are self-closing or contain nothing but indices, and the loop
+    // stops once a pass creates no new node.
+    const regex =
+      /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g;
+    let lastLength;
+    do {
+      lastLength = nodes.length;
+      data = data.replace(regex, (all, name, attrs, childIndices) => {
+        const index = nodes.length;
+        const node = new SimpleDOMNode(name);
+        const children = [];
+        if (childIndices) {
+          // Every index is followed by a comma, so the last piece is empty.
+          const indices = childIndices.split(',');
+          indices.pop();
+          indices.forEach((childIndex) => {
+            const childNode = nodes[+childIndex];
+            childNode.parentNode = node;
+            children.push(childNode);
+          });
+        }
+
+        node.childNodes = children;
+        nodes.push(node);
+        return index + ',';
+      });
+    } while (lastLength < nodes.length);
+
+    // We should only have one root index left, which will be last in the nodes.
+    return {
+      documentElement: nodes.pop(),
+    };
+  }
+
+  /**
+   * Decodes numeric character references and the five predefined XML
+   * entities; any other reference is returned unchanged.
+   *
+   * @param {string} text - Raw text taken from the document.
+   * @returns {string} The decoded text.
+   */
+  _decodeXML(text) {
+    if (text.indexOf('&') < 0) {
+      return text;
+    }
+
+    return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi,
+                        (all, entityName, number) => {
+      if (number) {
+        // The `i` flag also lets an upper-case 'X' prefix through; such a
+        // value is not treated as hexadecimal and ends up as NaN below.
+        const codePoint = (number[0] === 'x' ?
+                           parseInt(number.substring(1), 16) : +number);
+        // Leave references that are NaN or beyond U+10FFFF untouched rather
+        // than emitting a bogus character (`fromCodePoint` would throw).
+        if (!(codePoint <= 0x10FFFF)) {
+          return all;
+        }
+        // `fromCodePoint` also covers characters above U+FFFF, which
+        // `fromCharCode` silently truncates to 16 bits.
+        return String.fromCodePoint(codePoint);
+      }
+
+      switch (entityName) {
+        case 'amp':
+          return '&';
+        case 'lt':
+          return '<';
+        case 'gt':
+          return '>';
+        case 'quot':
+          return '\"';
+        case 'apos':
+          return '\'';
+      }
+      return '&' + entityName + ';';
+    });
+  }
+}
+
/**
* Optimised CSS custom property getter/setter.
* @class
@@ -353,4 +479,5 @@ export {
DOMCanvasFactory,
DOMCMapReaderFactory,
DOMSVGFactory,
+ SimpleXMLParser,
};
diff --git a/src/display/metadata.js b/src/display/metadata.js
index 0b2fe2a20..7878ec838 100644
--- a/src/display/metadata.js
+++ b/src/display/metadata.js
@@ -13,18 +13,19 @@
* limitations under the License.
*/
+import { assert, deprecated } from '../shared/util';
+import { SimpleXMLParser } from './dom_utils';
+
class Metadata {
constructor(data) {
- if (typeof data === 'string') {
- // Ghostscript may produce invalid metadata, so try to repair that first.
- data = this._repair(data);
-
- // Convert the string to a DOM `Document`.
- let parser = new DOMParser();
- data = parser.parseFromString(data, 'application/xml');
- } else if (!(data instanceof Document)) {
- throw new Error('Metadata: input is not a string or `Document`');
- }
+ assert(typeof data === 'string', 'Metadata: input is not a string');
+
+ // Ghostscript may produce invalid metadata, so try to repair that first.
+ data = this._repair(data);
+
+ // Convert the string to a DOM `Document`.
+ let parser = new SimpleXMLParser();
+ data = parser.parseFromString(data);
this._metadata = Object.create(null);
@@ -90,9 +91,18 @@ class Metadata {
return this._metadata[name] || null;
}
+ /**
+ * Returns all metadata entries at once.
+ *
+ * @returns {Object} The internal `_metadata` object itself (the one that
+ * `get(name)` and `has(name)` look names up in), not a copy.
+ */
+ getAll() {
+ return this._metadata;
+ }
+
has(name) {
return typeof this._metadata[name] !== 'undefined';
}
+
+ /**
+ * Deprecated accessor kept for existing callers: reports the deprecation
+ * through `deprecated()` and then returns the result of `getAll()`.
+ */
+ get metadata() {
+ deprecated('`metadata` getter; use `getAll()` instead.');
+ return this.getAll();
+ }
}
export {
diff --git a/test/unit/metadata_spec.js b/test/unit/metadata_spec.js
index 548bf4318..f7fa947fa 100644
--- a/test/unit/metadata_spec.js
+++ b/test/unit/metadata_spec.js
@@ -16,15 +16,37 @@
import { Metadata } from '../../src/display/metadata';
describe('metadata', function() {
- describe('incorrect_xmp', function() {
- it('should fix the incorrect XMP data', function() {
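- var invalidXMP = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
- '<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
- '<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
- '<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>' +
- '</rdf:Description></rdf:RDF></x:xmpmeta>';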
- var meta = new Metadata(invalidXMP);
- expect(meta.get('dc:title')).toEqual('PDF&');
- });
+ it('should handle valid metadata', function() {
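+ var validData = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
+ '<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
+ '<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
+ '<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Foo bar baz</rdf:li>' +
+ '</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>';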
+ var metadata = new Metadata(validData);
+
+ expect(metadata.has('dc:title')).toBeTruthy();
+ expect(metadata.has('dc:qux')).toBeFalsy();
+
+ expect(metadata.get('dc:title')).toEqual('Foo bar baz');
+ expect(metadata.get('dc:qux')).toEqual(null);
+
+ expect(metadata.getAll()).toEqual({ 'dc:title': 'Foo bar baz', });
+ });
+
+ it('should repair and handle invalid metadata', function() {
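+ var invalidData = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
+ '<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
+ '<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
+ '<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>' +
+ '</rdf:Description></rdf:RDF></x:xmpmeta>';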
+ var metadata = new Metadata(invalidData);
+
+ expect(metadata.has('dc:title')).toBeTruthy();
+ expect(metadata.has('dc:qux')).toBeFalsy();
+
+ expect(metadata.get('dc:title')).toEqual('PDF&');
+ expect(metadata.get('dc:qux')).toEqual(null);
+
+ expect(metadata.getAll()).toEqual({ 'dc:title': 'PDF&', });
});
});