From 089d188084647a34e27aec1e764f60fd97c0de05 Mon Sep 17 00:00:00 2001 From: Braden Anderson Date: Thu, 11 Dec 2014 13:29:56 -0700 Subject: [PATCH] use parser to find last row end in chunks --- papaparse.js | 95 +++++++++++++++----------------------------- tests/test-cases.js | 88 ++++++++++++++++++++++++++++++++++++---- tests/test-runner.js | 12 ++++-- 3 files changed, 120 insertions(+), 75 deletions(-) diff --git a/papaparse.js b/papaparse.js index a04021d..b3f7bd7 100644 --- a/papaparse.js +++ b/papaparse.js @@ -522,34 +522,16 @@ finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr); - var lastLineEnd; + var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile); + var lastIndex = results.meta.cursor; if (!finishedWithEntireFile) { - lastLineEnd = aggregate.lastIndexOf("\r"); - - if (lastLineEnd == -1) - lastLineEnd = aggregate.lastIndexOf("\n"); - - if (lastLineEnd != -1) - { - partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character - aggregate = aggregate.substring(0, lastLineEnd); - } - else - { - // For chunk sizes smaller than a line (a line could not fit in a single chunk) - // we simply build our aggregate by reading in the next chunk, until we find a newline - nextChunk(); - return; - } + partialLine = aggregate.substring(lastIndex - baseIndex); + baseIndex = lastIndex; } - - var results = handle.parse(aggregate, baseIndex); - aggregate = ""; - if (!finishedWithEntireFile) - baseIndex += lastLineEnd + 1; if (results && results.data) rowCount += results.data.length; + aggregate = ""; var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview); @@ -705,34 +687,16 @@ finishedWithEntireFile = start >= file.size; - var lastLineEnd; + var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile); + var lastIndex = results.meta.cursor; if (!finishedWithEntireFile) { - lastLineEnd = aggregate.lastIndexOf("\r"); // TODO: Use an auto-detected line ending? - - if (lastLineEnd == -1) - lastLineEnd = aggregate.lastIndexOf("\n"); - - if (lastLineEnd != -1) - { - partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character (TODO: Not always length 1? \r\n...) - aggregate = aggregate.substring(0, lastLineEnd); - } - else - { - // For chunk sizes smaller than a line (a line could not fit in a single chunk) - // we simply build our aggregate by reading in the next chunk, until we find a newline - nextChunk(); - return; - } + partialLine = aggregate.substring(lastIndex - baseIndex); + baseIndex = lastIndex; } - - var results = handle.parse(aggregate, baseIndex); - aggregate = ""; - if (!finishedWithEntireFile) - baseIndex += lastLineEnd + 1; if (results && results.data) rowCount += results.data.length; + aggregate = ""; var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview); @@ -839,7 +803,7 @@ }; } - this.parse = function(input, baseIndex) + this.parse = function(input, baseIndex, ignoreLastRow) { if (!_config.newline) _config.newline = guessLineEndings(input); @@ -864,7 +828,7 @@ _input = input; _parser = new Parser(parserConfig); - _results = _parser.parse(_input, baseIndex); + _results = _parser.parse(_input, baseIndex, ignoreLastRow); processResults(); if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished())) _config.complete(_results); @@ -1112,7 +1076,7 @@ var cursor = 0; var aborted = false; - this.parse = function(input, baseIndex) + this.parse = function(input, baseIndex, ignoreLastRow) { // For some reason, in Chrome, this speeds things up (!?) if (typeof input !== 'string') @@ -1182,25 +1146,24 @@ if (quoteSearch === -1) { - // No closing quote... what a pity - errors.push({ - type: "Quotes", - code: "MissingQuotes", - message: "Quoted field unterminated", - row: data.length, // row has yet to be inserted - index: cursor - }); + if (!ignoreLastRow) { + // No closing quote... what a pity + errors.push({ + type: "Quotes", + code: "MissingQuotes", + message: "Quoted field unterminated", + row: data.length, // row has yet to be inserted + index: cursor + }); + } return finish(); } if (quoteSearch === inputLen-1) { // Closing quote at EOF - row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"')); - pushRow(row); - if (stepIsFunction) - doStep(); - return returnable(); + var value = input.substring(cursor, quoteSearch).replace(/""/g, '"'); + return finish(value); } // If this quote is escaped, it's part of the data; skip it @@ -1298,9 +1261,13 @@ // Appends the remaining input from cursor to the end into // row, saves the row, calls step, and returns the results. - function finish() + function finish(value) { - row.push(input.substr(cursor)); + if (ignoreLastRow) + return returnable(); + if (!value) + value = input.substr(cursor); + row.push(value); cursor = inputLen; // important in case parsing is paused pushRow(row); if (stepIsFunction) diff --git a/tests/test-cases.js b/tests/test-cases.js index caf4d27..8497dc9 100644 --- a/tests/test-cases.js +++ b/tests/test-cases.js @@ -1,5 +1,10 @@ var RECORD_SEP = String.fromCharCode(30); var UNIT_SEP = String.fromCharCode(31); +var FILES_ENABLED = false; +try { + new File([""], ""); + FILES_ENABLED = true; +} catch (e) {} // safari, ie // Tests for the core parser using new Papa.Parser().parse() (CSV to JSON) var CORE_PARSER_TESTS = [ @@ -845,7 +850,28 @@ var PARSE_ASYNC_TESTS = [ data: [['A','B','C'],['X','Y','Z']], errors: [] } - } + }, + { + description: "Simple file", + input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false, + config: { + }, + expected: { + data: [['A','B','C'],['X','Y','Z']], + errors: [] + } + }, + { + description: "Simple file + worker", + input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false, + config: { + worker: true, + }, + expected: { + data: [['A','B','C'],['X','Y','Z']], + errors: [] + } + } ]; @@ -1045,7 +1071,7 @@ var CUSTOM_TESTS = [ } }, { - description: "Step exposes cursor for files", + description: "Step exposes cursor for downloads", expected: [129, 287, 452, 595, 727, 865, 1031, 1209], run: function(callback) { var updates = []; @@ -1060,9 +1086,8 @@ var CUSTOM_TESTS = [ } }, { - description: "Step exposes cursor for chunked files", - // Tiny inconsistency: the last full row in each chunk will not see a newline. - expected: [129, 287, 451, 595, 727, 864, 1031, 1209], + description: "Step exposes cursor for chunked downloads", + expected: [129, 287, 452, 595, 727, 865, 1031, 1209], run: function(callback) { var updates = []; Papa.parse("/tests/long-sample.csv", { @@ -1078,8 +1103,7 @@ var CUSTOM_TESTS = [ }, { description: "Step exposes cursor for workers", - // You're only really getting chunk cursors here. - expected: [451, 451, 451, 864, 864, 864, 1209, 1209], + expected: [452, 452, 452, 865, 865, 865, 1209, 1209], run: function(callback) { var updates = []; Papa.parse("/tests/long-sample.csv", { @@ -1112,7 +1136,7 @@ var CUSTOM_TESTS = [ }, { description: "Chunk is called with cursor position", - expected: [451, 864, 1209], + expected: [452, 865, 1209], run: function(callback) { var updates = []; Papa.parse("/tests/long-sample.csv", { @@ -1126,4 +1150,52 @@ var CUSTOM_TESTS = [ }); } }, + { + description: "Step exposes indexes for files", + expected: [6, 12, 17], + disabled: !FILES_ENABLED, + run: function(callback) { + var updates = []; + Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), { + download: true, + step: function(response) { + updates.push(response.indexes[0]); + }, complete: function() { + callback(updates); + } + }); + } + }, + { + description: "Step exposes indexes for chunked files", + expected: [6, 12, 17], + disabled: !FILES_ENABLED, + run: function(callback) { + var updates = []; + Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), { + chunkSize: 3, + step: function(response) { + updates.push(response.indexes[0]); + }, complete: function() { + callback(updates); + } + }); + } + }, + { + description: "Quoted line breaks near chunk boundaries are handled", + expected: [['A', 'B', 'C'], ['X', 'Y\n1\n2\n3', 'Z']], + disabled: !FILES_ENABLED, + run: function(callback) { + var updates = []; + Papa.parse(new File(['A,B,C\nX,"Y\n1\n2\n3",Z'], 'sample.csv'), { + chunkSize: 3, + step: function(response) { + updates.push(response.data[0]); + }, complete: function() { + callback(updates); + } + }); + } + } ]; diff --git a/tests/test-runner.js b/tests/test-runner.js index cd3c264..ec8f2ce 100644 --- a/tests/test-runner.js +++ b/tests/test-runner.js @@ -100,7 +100,7 @@ function runParseTests(asyncDone) { var results = compare(actual.data, actual.errors, test.expected); - displayResults("tests-for-parse", test, actual, results); + displayResults("#tests-for-parse", test, actual, results); if (results.data.passed && results.errors.passed) { passCount++; @@ -115,7 +115,7 @@ function runParseTests(asyncDone) config.error = function(err) { failCount++; - displayResults(test, {data:[],errors:err}, test.expected); + displayResults("#tests-for-parse", test, {data:[],errors:err}, test.expected); if (--asyncRemaining === 0) { asyncDone(); } @@ -308,7 +308,7 @@ function runUnparseTests() // and renders results in the table. function runCustomTests(asyncDone) { - var asyncRemaining = CUSTOM_TESTS.length; + var asyncRemaining = 0; for (var i = 0; i < CUSTOM_TESTS.length; i++) { runTest(CUSTOM_TESTS[i]); @@ -316,6 +316,9 @@ function runCustomTests(asyncDone) function runTest(test) { + if (test.disabled) + return; + asyncRemaining++; try { test.run(function(actual) { @@ -391,6 +394,9 @@ function passOrFailTd(result) // Reveals some hidden, whitespace, or invisible characters function revealChars(txt) { + if (typeof txt != 'string') + return '(file)'; + // Make spaces and tabs more obvious when glancing txt = txt.replace(/( |\t)/ig, '$1'); txt = txt.replace(/(\r\n|\n\r|\r|\n)/ig, '$1$1');