diff --git a/src/core/evaluator.js b/src/core/evaluator.js index edc988b96..ba2ee9ef5 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1757,6 +1757,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { properties.differences = differences; properties.baseEncodingName = baseEncodingName; + properties.hasEncoding = !!baseEncodingName || differences.length > 0; properties.dict = dict; return toUnicodePromise.then(function(toUnicode) { properties.toUnicode = toUnicode; @@ -1774,8 +1775,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { * {ToUnicodeMap|IdentityToUnicodeMap} object. */ buildToUnicode: function PartialEvaluator_buildToUnicode(properties) { + properties.hasIncludedToUnicodeMap = + !!properties.toUnicode && properties.toUnicode.length > 0; // Section 9.10.2 Mapping Character Codes to Unicode Values - if (properties.toUnicode && properties.toUnicode.length !== 0) { + if (properties.hasIncludedToUnicodeMap) { return Promise.resolve(properties.toUnicode); } // According to the spec if the font is a simple font we should only map diff --git a/src/core/fonts.js b/src/core/fonts.js index ae9ced291..6d38a0920 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -163,6 +163,30 @@ function adjustWidths(properties) { properties.defaultWidth *= scale; } +function adjustToUnicode(properties, builtInEncoding) { + if (properties.hasIncludedToUnicodeMap) { + return; // The font dictionary has a `ToUnicode` entry. + } + if (properties.hasEncoding) { + return; // The font dictionary has an `Encoding` entry. + } + if (builtInEncoding === properties.defaultEncoding) { + return; // No point in trying to adjust `toUnicode` if the encodings match. + } + if (properties.toUnicode instanceof IdentityToUnicodeMap) { + return; + } + var toUnicode = [], glyphsUnicodeMap = getGlyphsUnicode(); + for (var charCode in builtInEncoding) { + var glyphName = builtInEncoding[charCode]; + var unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); + if (unicode !== -1) { + toUnicode[charCode] = String.fromCharCode(unicode); + } + } + properties.toUnicode.amend(toUnicode); +} + function getFontType(type, subtype) { switch (type) { case 'Type1': @@ -261,7 +285,13 @@ var ToUnicodeMap = (function ToUnicodeMapClosure() { charCodeOf: function(v) { return this._map.indexOf(v); - } + }, + + amend: function (map) { + for (var charCode in map) { + this._map[charCode] = map[charCode]; + } + }, }; return ToUnicodeMap; @@ -297,7 +327,11 @@ var IdentityToUnicodeMap = (function IdentityToUnicodeMapClosure() { charCodeOf: function (v) { return (isInt(v) && v >= this.firstChar && v <= this.lastChar) ? v : -1; - } + }, + + amend: function (map) { + error('Should not call amend()'); + }, }; return IdentityToUnicodeMap; @@ -765,6 +799,7 @@ var Font = (function FontClosure() { this.fontMatrix = properties.fontMatrix; this.widths = properties.widths; this.defaultWidth = properties.defaultWidth; + this.toUnicode = properties.toUnicode; this.encoding = properties.baseEncoding; this.seacMap = properties.seacMap; @@ -2386,10 +2421,8 @@ var Font = (function FontClosure() { } else { // Most of the following logic in this code branch is based on the // 9.6.6.4 of the PDF spec. - var hasEncoding = - properties.differences.length > 0 || !!properties.baseEncodingName; - var cmapTable = - readCmapTable(tables['cmap'], font, this.isSymbolicFont, hasEncoding); + var cmapTable = readCmapTable(tables['cmap'], font, this.isSymbolicFont, + properties.hasEncoding); var cmapPlatformId = cmapTable.platformId; var cmapEncodingId = cmapTable.encodingId; var cmapMappings = cmapTable.mappings; @@ -2398,7 +2431,7 @@ var Font = (function FontClosure() { // The spec seems to imply that if the font is symbolic the encoding // should be ignored, this doesn't appear to work for 'preistabelle.pdf' // where the the font is symbolic and it has an encoding. - if (hasEncoding && + if (properties.hasEncoding && (cmapPlatformId === 3 && cmapEncodingId === 1 || cmapPlatformId === 1 && cmapEncodingId === 0) || (cmapPlatformId === -1 && cmapEncodingId === -1 && // Temporary hack @@ -2562,6 +2595,12 @@ var Font = (function FontClosure() { // TODO: Check the charstring widths to determine this. properties.fixedPitch = false; + if (properties.builtInEncoding) { + // For Type1 fonts that do not include either `ToUnicode` or `Encoding` + // data, attempt to use the `builtInEncoding` to improve text selection. + adjustToUnicode(properties, properties.builtInEncoding); + } + var mapping = font.getGlyphMapping(properties); var newMapping = adjustMapping(mapping, properties); this.toFontChar = newMapping.toFontChar; diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 546382bea..0e2e4b5a1 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -22,6 +22,7 @@ !issue5808.pdf !issue6204.pdf !issue6782.pdf +!issue6901.pdf !issue6961.pdf !issue6962.pdf !issue7020.pdf diff --git a/test/pdfs/issue6901.pdf b/test/pdfs/issue6901.pdf new file mode 100644 index 000000000..90ec41783 Binary files /dev/null and b/test/pdfs/issue6901.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index bc8b0412c..6ffefb1a2 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1220,6 +1220,20 @@ "link": false, "type": "text" }, + { "id": "issue6901-eq", + "file": "pdfs/issue6901.pdf", + "md5": "1a0604b1a7a3aaf2162b425a9a84230b", + "rounds": 1, + "link": false, + "type": "eq" + }, + { "id": "issue6901-text", + "file": "pdfs/issue6901.pdf", + "md5": "1a0604b1a7a3aaf2162b425a9a84230b", + "rounds": 1, + "link": false, + "type": "text" + }, { "id": "issue6962", "file": "pdfs/issue6962.pdf", "md5": "d40e871ecca68baf93114bd28c782148",