Browse Source

use parser to find last row end in chunks

pull/135/head
Braden Anderson 10 years ago
parent
commit
089d188084
  1. 95
      papaparse.js
  2. 88
      tests/test-cases.js
  3. 12
      tests/test-runner.js

95
papaparse.js

@ -522,34 +522,16 @@
finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr); finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr);
var lastLineEnd; var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
var lastIndex = results.meta.cursor;
if (!finishedWithEntireFile) if (!finishedWithEntireFile)
{ {
lastLineEnd = aggregate.lastIndexOf("\r"); partialLine = aggregate.substring(lastIndex - baseIndex);
baseIndex = lastIndex;
if (lastLineEnd == -1)
lastLineEnd = aggregate.lastIndexOf("\n");
if (lastLineEnd != -1)
{
partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character
aggregate = aggregate.substring(0, lastLineEnd);
}
else
{
// For chunk sizes smaller than a line (a line could not fit in a single chunk)
// we simply build our aggregate by reading in the next chunk, until we find a newline
nextChunk();
return;
}
} }
var results = handle.parse(aggregate, baseIndex);
aggregate = "";
if (!finishedWithEntireFile)
baseIndex += lastLineEnd + 1;
if (results && results.data) if (results && results.data)
rowCount += results.data.length; rowCount += results.data.length;
aggregate = "";
var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview); var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);
@ -705,34 +687,16 @@
finishedWithEntireFile = start >= file.size; finishedWithEntireFile = start >= file.size;
var lastLineEnd; var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
var lastIndex = results.meta.cursor;
if (!finishedWithEntireFile) if (!finishedWithEntireFile)
{ {
lastLineEnd = aggregate.lastIndexOf("\r"); // TODO: Use an auto-detected line ending? partialLine = aggregate.substring(lastIndex - baseIndex);
baseIndex = lastIndex;
if (lastLineEnd == -1)
lastLineEnd = aggregate.lastIndexOf("\n");
if (lastLineEnd != -1)
{
partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character (TODO: Not always length 1? \r\n...)
aggregate = aggregate.substring(0, lastLineEnd);
}
else
{
// For chunk sizes smaller than a line (a line could not fit in a single chunk)
// we simply build our aggregate by reading in the next chunk, until we find a newline
nextChunk();
return;
}
} }
var results = handle.parse(aggregate, baseIndex);
aggregate = "";
if (!finishedWithEntireFile)
baseIndex += lastLineEnd + 1;
if (results && results.data) if (results && results.data)
rowCount += results.data.length; rowCount += results.data.length;
aggregate = "";
var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview); var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);
@ -839,7 +803,7 @@
}; };
} }
this.parse = function(input, baseIndex) this.parse = function(input, baseIndex, ignoreLastRow)
{ {
if (!_config.newline) if (!_config.newline)
_config.newline = guessLineEndings(input); _config.newline = guessLineEndings(input);
@ -864,7 +828,7 @@
_input = input; _input = input;
_parser = new Parser(parserConfig); _parser = new Parser(parserConfig);
_results = _parser.parse(_input, baseIndex); _results = _parser.parse(_input, baseIndex, ignoreLastRow);
processResults(); processResults();
if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished())) if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished()))
_config.complete(_results); _config.complete(_results);
@ -1112,7 +1076,7 @@
var cursor = 0; var cursor = 0;
var aborted = false; var aborted = false;
this.parse = function(input, baseIndex) this.parse = function(input, baseIndex, ignoreLastRow)
{ {
// For some reason, in Chrome, this speeds things up (!?) // For some reason, in Chrome, this speeds things up (!?)
if (typeof input !== 'string') if (typeof input !== 'string')
@ -1182,25 +1146,24 @@
if (quoteSearch === -1) if (quoteSearch === -1)
{ {
// No closing quote... what a pity if (!ignoreLastRow) {
errors.push({ // No closing quote... what a pity
type: "Quotes", errors.push({
code: "MissingQuotes", type: "Quotes",
message: "Quoted field unterminated", code: "MissingQuotes",
row: data.length, // row has yet to be inserted message: "Quoted field unterminated",
index: cursor row: data.length, // row has yet to be inserted
}); index: cursor
});
}
return finish(); return finish();
} }
if (quoteSearch === inputLen-1) if (quoteSearch === inputLen-1)
{ {
// Closing quote at EOF // Closing quote at EOF
row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"')); var value = input.substring(cursor, quoteSearch).replace(/""/g, '"');
pushRow(row); return finish(value);
if (stepIsFunction)
doStep();
return returnable();
} }
// If this quote is escaped, it's part of the data; skip it // If this quote is escaped, it's part of the data; skip it
@ -1298,9 +1261,13 @@
// Appends the remaining input from cursor to the end into // Appends the remaining input from cursor to the end into
// row, saves the row, calls step, and returns the results. // row, saves the row, calls step, and returns the results.
function finish() function finish(value)
{ {
row.push(input.substr(cursor)); if (ignoreLastRow)
return returnable();
if (!value)
value = input.substr(cursor);
row.push(value);
cursor = inputLen; // important in case parsing is paused cursor = inputLen; // important in case parsing is paused
pushRow(row); pushRow(row);
if (stepIsFunction) if (stepIsFunction)

88
tests/test-cases.js

@ -1,5 +1,10 @@
var RECORD_SEP = String.fromCharCode(30); var RECORD_SEP = String.fromCharCode(30);
var UNIT_SEP = String.fromCharCode(31); var UNIT_SEP = String.fromCharCode(31);
var FILES_ENABLED = false;
try {
new File([""], "");
FILES_ENABLED = true;
} catch (e) {} // safari, ie
// Tests for the core parser using new Papa.Parser().parse() (CSV to JSON) // Tests for the core parser using new Papa.Parser().parse() (CSV to JSON)
var CORE_PARSER_TESTS = [ var CORE_PARSER_TESTS = [
@ -845,7 +850,28 @@ var PARSE_ASYNC_TESTS = [
data: [['A','B','C'],['X','Y','Z']], data: [['A','B','C'],['X','Y','Z']],
errors: [] errors: []
} }
} },
{
description: "Simple file",
input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
config: {
},
expected: {
data: [['A','B','C'],['X','Y','Z']],
errors: []
}
},
{
description: "Simple file + worker",
input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
config: {
worker: true,
},
expected: {
data: [['A','B','C'],['X','Y','Z']],
errors: []
}
}
]; ];
@ -1045,7 +1071,7 @@ var CUSTOM_TESTS = [
} }
}, },
{ {
description: "Step exposes cursor for files", description: "Step exposes cursor for downloads",
expected: [129, 287, 452, 595, 727, 865, 1031, 1209], expected: [129, 287, 452, 595, 727, 865, 1031, 1209],
run: function(callback) { run: function(callback) {
var updates = []; var updates = [];
@ -1060,9 +1086,8 @@ var CUSTOM_TESTS = [
} }
}, },
{ {
description: "Step exposes cursor for chunked files", description: "Step exposes cursor for chunked downloads",
// Tiny inconsistency: the last full row in each chunk will not see a newline. expected: [129, 287, 452, 595, 727, 865, 1031, 1209],
expected: [129, 287, 451, 595, 727, 864, 1031, 1209],
run: function(callback) { run: function(callback) {
var updates = []; var updates = [];
Papa.parse("/tests/long-sample.csv", { Papa.parse("/tests/long-sample.csv", {
@ -1078,8 +1103,7 @@ var CUSTOM_TESTS = [
}, },
{ {
description: "Step exposes cursor for workers", description: "Step exposes cursor for workers",
// You're only really getting chunk cursors here. expected: [452, 452, 452, 865, 865, 865, 1209, 1209],
expected: [451, 451, 451, 864, 864, 864, 1209, 1209],
run: function(callback) { run: function(callback) {
var updates = []; var updates = [];
Papa.parse("/tests/long-sample.csv", { Papa.parse("/tests/long-sample.csv", {
@ -1112,7 +1136,7 @@ var CUSTOM_TESTS = [
}, },
{ {
description: "Chunk is called with cursor position", description: "Chunk is called with cursor position",
expected: [451, 864, 1209], expected: [452, 865, 1209],
run: function(callback) { run: function(callback) {
var updates = []; var updates = [];
Papa.parse("/tests/long-sample.csv", { Papa.parse("/tests/long-sample.csv", {
@ -1126,4 +1150,52 @@ var CUSTOM_TESTS = [
}); });
} }
}, },
{
description: "Step exposes indexes for files",
expected: [6, 12, 17],
disabled: !FILES_ENABLED,
run: function(callback) {
var updates = [];
Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
download: true,
step: function(response) {
updates.push(response.indexes[0]);
}, complete: function() {
callback(updates);
}
});
}
},
{
description: "Step exposes indexes for chunked files",
expected: [6, 12, 17],
disabled: !FILES_ENABLED,
run: function(callback) {
var updates = [];
Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
chunkSize: 3,
step: function(response) {
updates.push(response.indexes[0]);
}, complete: function() {
callback(updates);
}
});
}
},
{
description: "Quoted line breaks near chunk boundaries are handled",
expected: [['A', 'B', 'C'], ['X', 'Y\n1\n2\n3', 'Z']],
disabled: !FILES_ENABLED,
run: function(callback) {
var updates = [];
Papa.parse(new File(['A,B,C\nX,"Y\n1\n2\n3",Z'], 'sample.csv'), {
chunkSize: 3,
step: function(response) {
updates.push(response.data[0]);
}, complete: function() {
callback(updates);
}
});
}
}
]; ];

12
tests/test-runner.js

@ -100,7 +100,7 @@ function runParseTests(asyncDone)
{ {
var results = compare(actual.data, actual.errors, test.expected); var results = compare(actual.data, actual.errors, test.expected);
displayResults("tests-for-parse", test, actual, results); displayResults("#tests-for-parse", test, actual, results);
if (results.data.passed && results.errors.passed) { if (results.data.passed && results.errors.passed) {
passCount++; passCount++;
@ -115,7 +115,7 @@ function runParseTests(asyncDone)
config.error = function(err) config.error = function(err)
{ {
failCount++; failCount++;
displayResults(test, {data:[],errors:err}, test.expected); displayResults("#tests-for-parse", test, {data:[],errors:err}, test.expected);
if (--asyncRemaining === 0) { if (--asyncRemaining === 0) {
asyncDone(); asyncDone();
} }
@ -308,7 +308,7 @@ function runUnparseTests()
// and renders results in the table. // and renders results in the table.
function runCustomTests(asyncDone) function runCustomTests(asyncDone)
{ {
var asyncRemaining = CUSTOM_TESTS.length; var asyncRemaining = 0;
for (var i = 0; i < CUSTOM_TESTS.length; i++) for (var i = 0; i < CUSTOM_TESTS.length; i++)
{ {
runTest(CUSTOM_TESTS[i]); runTest(CUSTOM_TESTS[i]);
@ -316,6 +316,9 @@ function runCustomTests(asyncDone)
function runTest(test) function runTest(test)
{ {
if (test.disabled)
return;
asyncRemaining++;
try try
{ {
test.run(function(actual) { test.run(function(actual) {
@ -391,6 +394,9 @@ function passOrFailTd(result)
// Reveals some hidden, whitespace, or invisible characters // Reveals some hidden, whitespace, or invisible characters
function revealChars(txt) function revealChars(txt)
{ {
if (typeof txt != 'string')
return '(file)';
// Make spaces and tabs more obvious when glancing // Make spaces and tabs more obvious when glancing
txt = txt.replace(/( |\t)/ig, '<span class="whitespace-char">$1</span>'); txt = txt.replace(/( |\t)/ig, '<span class="whitespace-char">$1</span>');
txt = txt.replace(/(\r\n|\n\r|\r|\n)/ig, '<span class="whitespace-char special-char">$1</span>$1'); txt = txt.replace(/(\r\n|\n\r|\r|\n)/ig, '<span class="whitespace-char special-char">$1</span>$1');

Loading…
Cancel
Save