From 9bc872a2b8879c072490312d5de44edf90c4a7a2 Mon Sep 17 00:00:00 2001 From: Jaymes Lauser Date: Fri, 27 Jul 2018 00:15:24 -0700 Subject: [PATCH] Fix linebreak problem in headers (#542) --- papaparse.js | 15 +++-- tests/test-cases.js | 134 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 5 deletions(-) diff --git a/papaparse.js b/papaparse.js index 5f670bd..23c7889 100755 --- a/papaparse.js +++ b/papaparse.js @@ -1008,8 +1008,9 @@ */ this.parse = function(input, baseIndex, ignoreLastRow) { + var quoteChar = _config.quoteChar || '"'; if (!_config.newline) - _config.newline = guessLineEndings(input); + _config.newline = guessLineEndings(input, quoteChar); _delimiterError = false; if (!_config.delimiter) @@ -1252,9 +1253,12 @@ }; } - function guessLineEndings(input) + function guessLineEndings(input, quoteChar) { input = input.substr(0, 1024 * 1024); // max length 1 MB + // Replace all the text inside quotes + var re = new RegExp(escapeRegExp(quoteChar) + '([^]*?)' + escapeRegExp(quoteChar), 'gm'); + input = input.replace(re, ''); var r = input.split('\r'); @@ -1286,9 +1290,10 @@ } } - - - + /** https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions */ + function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string + } /** The core parser implements speedy and correct CSV parsing */ function Parser(config) diff --git a/tests/test-cases.js b/tests/test-cases.js index ad0d9a5..df5c152 100644 --- a/tests/test-cases.js +++ b/tests/test-cases.js @@ -1208,6 +1208,136 @@ var PARSE_TESTS = [ data: [{'a': 'c', 'b': 'd'}], errors: [] } + }, + { + description: "Carriage return in header inside quotes, with line feed endings", + input: '"a\r\na","b"\n"c","d"\n"e","f"\n"g","h"\n"i","j"', + config: {}, + expected: { + data: [['a\r\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [] + } + }, + { + description: "Line feed in header inside quotes, with carriage return + line feed endings", + input: '"a\na","b"\r\n"c","d"\r\n"e","f"\r\n"g","h"\r\n"i","j"', + config: {}, + expected: { + data: [['a\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [] + } + }, + { + description: "Using \\r\\n endings uses \\r\\n linebreak", + input: 'a,b\r\nc,d\r\ne,f\r\ng,h\r\ni,j', + config: {}, + expected: { + data: [['a', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\r\n', + delimiter: ',', + cursor: 23, + aborted: false, + truncated: false + } + } + }, + { + description: "Using \\n endings uses \\n linebreak", + input: 'a,b\nc,d\ne,f\ng,h\ni,j', + config: {}, + expected: { + data: [['a', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\n', + delimiter: ',', + cursor: 19, + aborted: false, + truncated: false + } + } + }, + { + description: "Using \\r\\n endings with \\r\\n in header field uses \\r\\n linebreak", + input: '"a\r\na",b\r\nc,d\r\ne,f\r\ng,h\r\ni,j', + config: {}, + expected: { + data: [['a\r\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\r\n', + delimiter: ',', + cursor: 28, + aborted: false, + truncated: false + } + } + }, + { + description: "Using \\r\\n endings with \\n in header field uses \\r\\n linebreak", + input: '"a\na",b\r\nc,d\r\ne,f\r\ng,h\r\ni,j', + config: {}, + expected: { + data: [['a\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\r\n', + delimiter: ',', + cursor: 27, + aborted: false, + truncated: false + } + } + }, + { + description: "Using \\r\\n endings with \\n in header field with skip empty lines uses \\r\\n linebreak", + input: '"a\na",b\r\nc,d\r\ne,f\r\ng,h\r\ni,j\r\n', + config: {skipEmptyLines: true}, + expected: { + data: [['a\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\r\n', + delimiter: ',', + cursor: 29, + aborted: false, + truncated: false + } + } + }, + { + description: "Using \\n endings with \\r\\n in header field uses \\n linebreak", + input: '"a\r\na",b\nc,d\ne,f\ng,h\ni,j', + config: {}, + expected: { + data: [['a\r\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\n', + delimiter: ',', + cursor: 24, + aborted: false, + truncated: false + } + } + }, + { + description: "Using reserved regex characters as quote characters", + input: '.a\na.,b\r\nc,d\r\ne,f\r\ng,h\r\ni,j', + config: { quoteChar: '.' }, + expected: { + data: [['a\na', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h'], ['i', 'j']], + errors: [], + meta: { + linebreak: '\r\n', + delimiter: ',', + cursor: 27, + aborted: false, + truncated: false + } + } } ]; @@ -1215,6 +1345,10 @@ describe('Parse Tests', function() { function generateTest(test) { (test.disabled ? it.skip : it)(test.description, function() { var actual = Papa.parse(test.input, test.config); + // allows for testing the meta object if present in the test + if (test.expected.meta) { + assert.deepEqual(actual.meta, test.expected.meta); + } assert.deepEqual(JSON.stringify(actual.errors), JSON.stringify(test.expected.errors)); assert.deepEqual(actual.data, test.expected.data); });