Browse Source

For embedded Type1 fonts without included `ToUnicode`/`Encoding` data, attempt to improve text selection by using the `builtInEncoding` to amend the `toUnicode` map (issue 6901, issue 7182, issue 7217, bug 917796, bug 1242142)

Note that in order to prevent any possible issues, this patch does *not* try to amend the `toUnicode` data for Type1 fonts that contain either `ToUnicode` or `Encoding` entries in the font dictionary.

Fixes, or at least improves, issues/bugs such as 6658, 6901, 7182, 7217, bug 917796, and bug 1242142.
Jonas Jenwald 9 years ago
parent
commit
325f7afcca
  1. 5
      src/core/evaluator.js
  2. 51
      src/core/fonts.js
  3. 1
      test/pdfs/.gitignore
  4. BIN
      test/pdfs/issue6901.pdf
  5. 14
      test/test_manifest.json

5
src/core/evaluator.js

@ -1757,6 +1757,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
properties.differences = differences; properties.differences = differences;
properties.baseEncodingName = baseEncodingName; properties.baseEncodingName = baseEncodingName;
properties.hasEncoding = !!baseEncodingName || differences.length > 0;
properties.dict = dict; properties.dict = dict;
return toUnicodePromise.then(function(toUnicode) { return toUnicodePromise.then(function(toUnicode) {
properties.toUnicode = toUnicode; properties.toUnicode = toUnicode;
@ -1774,8 +1775,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
* {ToUnicodeMap|IdentityToUnicodeMap} object. * {ToUnicodeMap|IdentityToUnicodeMap} object.
*/ */
buildToUnicode: function PartialEvaluator_buildToUnicode(properties) { buildToUnicode: function PartialEvaluator_buildToUnicode(properties) {
properties.hasIncludedToUnicodeMap =
!!properties.toUnicode && properties.toUnicode.length > 0;
// Section 9.10.2 Mapping Character Codes to Unicode Values // Section 9.10.2 Mapping Character Codes to Unicode Values
if (properties.toUnicode && properties.toUnicode.length !== 0) { if (properties.hasIncludedToUnicodeMap) {
return Promise.resolve(properties.toUnicode); return Promise.resolve(properties.toUnicode);
} }
// According to the spec if the font is a simple font we should only map // According to the spec if the font is a simple font we should only map

51
src/core/fonts.js

@ -163,6 +163,30 @@ function adjustWidths(properties) {
properties.defaultWidth *= scale; properties.defaultWidth *= scale;
} }
/**
 * Attempts to improve text selection for fonts whose font dictionary has
 * neither a `ToUnicode` nor an `Encoding` entry, by amending
 * `properties.toUnicode` (in place) with the Unicode values of the glyph
 * names found in the font's built-in encoding.
 *
 * Nothing is changed when the font dictionary supplied `ToUnicode` or
 * `Encoding` data, when the built-in encoding is the very same object as
 * `properties.defaultEncoding`, or when `properties.toUnicode` is an
 * `IdentityToUnicodeMap` (which cannot be amended).
 *
 * @param {Object} properties - Font properties. Reads
 *   `hasIncludedToUnicodeMap`, `hasEncoding`, `defaultEncoding` and
 *   `toUnicode`; calls `toUnicode.amend()`.
 * @param {Object} builtInEncoding - Map from character code to glyph name.
 */
function adjustToUnicode(properties, builtInEncoding) {
  if (properties.hasIncludedToUnicodeMap) {
    return; // The font dictionary has a `ToUnicode` entry.
  }
  if (properties.hasEncoding) {
    return; // The font dictionary has an `Encoding` entry.
  }
  if (builtInEncoding === properties.defaultEncoding) {
    return; // No point in trying to adjust `toUnicode` if the encodings match.
  }
  if (properties.toUnicode instanceof IdentityToUnicodeMap) {
    return; // An identity map cannot be amended.
  }
  var toUnicode = [], glyphsUnicodeMap = getGlyphsUnicode();
  var hasOwn = Object.prototype.hasOwnProperty;
  var hasMappings = false;
  for (var charCode in builtInEncoding) {
    // `for...in` also visits enumerable inherited properties; only the
    // encoding's own entries are character codes.
    if (!hasOwn.call(builtInEncoding, charCode)) {
      continue;
    }
    var glyphName = builtInEncoding[charCode];
    var unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
    if (unicode === -1) {
      continue; // Unknown glyph name, leave the existing mapping alone.
    }
    if (unicode > 0xFFFF) {
      // `String.fromCharCode` truncates its arguments to 16 bits, so code
      // points outside the BMP must be written as a UTF-16 surrogate pair.
      var offset = unicode - 0x10000;
      toUnicode[charCode] = String.fromCharCode(0xD800 + (offset >> 10),
                                                0xDC00 + (offset & 0x3FF));
    } else {
      toUnicode[charCode] = String.fromCharCode(unicode);
    }
    hasMappings = true;
  }
  if (hasMappings) {
    properties.toUnicode.amend(toUnicode);
  }
}
function getFontType(type, subtype) { function getFontType(type, subtype) {
switch (type) { switch (type) {
case 'Type1': case 'Type1':
@ -261,7 +285,13 @@ var ToUnicodeMap = (function ToUnicodeMapClosure() {
charCodeOf: function(v) { charCodeOf: function(v) {
return this._map.indexOf(v); return this._map.indexOf(v);
},
amend: function (map) {
for (var charCode in map) {
this._map[charCode] = map[charCode];
} }
},
}; };
return ToUnicodeMap; return ToUnicodeMap;
@ -297,7 +327,11 @@ var IdentityToUnicodeMap = (function IdentityToUnicodeMapClosure() {
charCodeOf: function (v) { charCodeOf: function (v) {
return (isInt(v) && v >= this.firstChar && v <= this.lastChar) ? v : -1; return (isInt(v) && v >= this.firstChar && v <= this.lastChar) ? v : -1;
} },
// Amending is not supported on an IdentityToUnicodeMap: this method ignores
// `map` and only reports the call through `error()`. `adjustToUnicode`
// returns early for IdentityToUnicodeMap instances, so reaching this method
// indicates a caller bug.
// NOTE(review): `error()` is defined elsewhere and presumably throws; confirm.
amend: function (map) {
error('Should not call amend()');
},
}; };
return IdentityToUnicodeMap; return IdentityToUnicodeMap;
@ -765,6 +799,7 @@ var Font = (function FontClosure() {
this.fontMatrix = properties.fontMatrix; this.fontMatrix = properties.fontMatrix;
this.widths = properties.widths; this.widths = properties.widths;
this.defaultWidth = properties.defaultWidth; this.defaultWidth = properties.defaultWidth;
this.toUnicode = properties.toUnicode;
this.encoding = properties.baseEncoding; this.encoding = properties.baseEncoding;
this.seacMap = properties.seacMap; this.seacMap = properties.seacMap;
@ -2386,10 +2421,8 @@ var Font = (function FontClosure() {
} else { } else {
// Most of the following logic in this code branch is based on the // Most of the following logic in this code branch is based on the
// 9.6.6.4 of the PDF spec. // 9.6.6.4 of the PDF spec.
var hasEncoding = var cmapTable = readCmapTable(tables['cmap'], font, this.isSymbolicFont,
properties.differences.length > 0 || !!properties.baseEncodingName; properties.hasEncoding);
var cmapTable =
readCmapTable(tables['cmap'], font, this.isSymbolicFont, hasEncoding);
var cmapPlatformId = cmapTable.platformId; var cmapPlatformId = cmapTable.platformId;
var cmapEncodingId = cmapTable.encodingId; var cmapEncodingId = cmapTable.encodingId;
var cmapMappings = cmapTable.mappings; var cmapMappings = cmapTable.mappings;
@ -2398,7 +2431,7 @@ var Font = (function FontClosure() {
// The spec seems to imply that if the font is symbolic the encoding // The spec seems to imply that if the font is symbolic the encoding
// should be ignored, this doesn't appear to work for 'preistabelle.pdf' // should be ignored, this doesn't appear to work for 'preistabelle.pdf'
// where the the font is symbolic and it has an encoding. // where the the font is symbolic and it has an encoding.
if (hasEncoding && if (properties.hasEncoding &&
(cmapPlatformId === 3 && cmapEncodingId === 1 || (cmapPlatformId === 3 && cmapEncodingId === 1 ||
cmapPlatformId === 1 && cmapEncodingId === 0) || cmapPlatformId === 1 && cmapEncodingId === 0) ||
(cmapPlatformId === -1 && cmapEncodingId === -1 && // Temporary hack (cmapPlatformId === -1 && cmapEncodingId === -1 && // Temporary hack
@ -2562,6 +2595,12 @@ var Font = (function FontClosure() {
// TODO: Check the charstring widths to determine this. // TODO: Check the charstring widths to determine this.
properties.fixedPitch = false; properties.fixedPitch = false;
if (properties.builtInEncoding) {
// For Type1 fonts that do not include either `ToUnicode` or `Encoding`
// data, attempt to use the `builtInEncoding` to improve text selection.
adjustToUnicode(properties, properties.builtInEncoding);
}
var mapping = font.getGlyphMapping(properties); var mapping = font.getGlyphMapping(properties);
var newMapping = adjustMapping(mapping, properties); var newMapping = adjustMapping(mapping, properties);
this.toFontChar = newMapping.toFontChar; this.toFontChar = newMapping.toFontChar;

1
test/pdfs/.gitignore vendored

@ -22,6 +22,7 @@
!issue5808.pdf !issue5808.pdf
!issue6204.pdf !issue6204.pdf
!issue6782.pdf !issue6782.pdf
!issue6901.pdf
!issue6961.pdf !issue6961.pdf
!issue6962.pdf !issue6962.pdf
!issue7020.pdf !issue7020.pdf

BIN
test/pdfs/issue6901.pdf

Binary file not shown.

14
test/test_manifest.json

@ -1220,6 +1220,20 @@
"link": false, "link": false,
"type": "text" "type": "text"
}, },
{ "id": "issue6901-eq",
"file": "pdfs/issue6901.pdf",
"md5": "1a0604b1a7a3aaf2162b425a9a84230b",
"rounds": 1,
"link": false,
"type": "eq"
},
{ "id": "issue6901-text",
"file": "pdfs/issue6901.pdf",
"md5": "1a0604b1a7a3aaf2162b425a9a84230b",
"rounds": 1,
"link": false,
"type": "text"
},
{ "id": "issue6962", { "id": "issue6962",
"file": "pdfs/issue6962.pdf", "file": "pdfs/issue6962.pdf",
"md5": "d40e871ecca68baf93114bd28c782148", "md5": "d40e871ecca68baf93114bd28c782148",

Loading…
Cancel
Save