Browse Source

Merge pull request #135 from bluej100/chunkboundary

use parser to find last row end in chunks
pull/136/head
Matt Holt 10 years ago
parent
commit
11e84dfaa7
  1. 95
      papaparse.js
  2. 88
      tests/test-cases.js
  3. 12
      tests/test-runner.js

95
papaparse.js

@@ -522,34 +522,16 @@
finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr);
var lastLineEnd;
var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
var lastIndex = results.meta.cursor;
if (!finishedWithEntireFile)
{
lastLineEnd = aggregate.lastIndexOf("\r");
if (lastLineEnd == -1)
lastLineEnd = aggregate.lastIndexOf("\n");
if (lastLineEnd != -1)
{
partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character
aggregate = aggregate.substring(0, lastLineEnd);
}
else
{
// For chunk sizes smaller than a line (a line could not fit in a single chunk)
// we simply build our aggregate by reading in the next chunk, until we find a newline
nextChunk();
return;
}
partialLine = aggregate.substring(lastIndex - baseIndex);
baseIndex = lastIndex;
}
var results = handle.parse(aggregate, baseIndex);
aggregate = "";
if (!finishedWithEntireFile)
baseIndex += lastLineEnd + 1;
if (results && results.data)
rowCount += results.data.length;
aggregate = "";
var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);
@@ -705,34 +687,16 @@
finishedWithEntireFile = start >= file.size;
var lastLineEnd;
var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
var lastIndex = results.meta.cursor;
if (!finishedWithEntireFile)
{
lastLineEnd = aggregate.lastIndexOf("\r"); // TODO: Use an auto-detected line ending?
if (lastLineEnd == -1)
lastLineEnd = aggregate.lastIndexOf("\n");
if (lastLineEnd != -1)
{
partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character (TODO: Not always length 1? \r\n...)
aggregate = aggregate.substring(0, lastLineEnd);
}
else
{
// For chunk sizes smaller than a line (a line could not fit in a single chunk)
// we simply build our aggregate by reading in the next chunk, until we find a newline
nextChunk();
return;
}
partialLine = aggregate.substring(lastIndex - baseIndex);
baseIndex = lastIndex;
}
var results = handle.parse(aggregate, baseIndex);
aggregate = "";
if (!finishedWithEntireFile)
baseIndex += lastLineEnd + 1;
if (results && results.data)
rowCount += results.data.length;
aggregate = "";
var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);
@@ -839,7 +803,7 @@
};
}
this.parse = function(input, baseIndex)
this.parse = function(input, baseIndex, ignoreLastRow)
{
if (!_config.newline)
_config.newline = guessLineEndings(input);
@@ -864,7 +828,7 @@
_input = input;
_parser = new Parser(parserConfig);
_results = _parser.parse(_input, baseIndex);
_results = _parser.parse(_input, baseIndex, ignoreLastRow);
processResults();
if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished()))
_config.complete(_results);
@@ -1112,7 +1076,7 @@
var cursor = 0;
var aborted = false;
this.parse = function(input, baseIndex)
this.parse = function(input, baseIndex, ignoreLastRow)
{
// For some reason, in Chrome, this speeds things up (!?)
if (typeof input !== 'string')
@@ -1182,25 +1146,24 @@
if (quoteSearch === -1)
{
// No closing quote... what a pity
errors.push({
type: "Quotes",
code: "MissingQuotes",
message: "Quoted field unterminated",
row: data.length, // row has yet to be inserted
index: cursor
});
if (!ignoreLastRow) {
// No closing quote... what a pity
errors.push({
type: "Quotes",
code: "MissingQuotes",
message: "Quoted field unterminated",
row: data.length, // row has yet to be inserted
index: cursor
});
}
return finish();
}
if (quoteSearch === inputLen-1)
{
// Closing quote at EOF
row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"'));
pushRow(row);
if (stepIsFunction)
doStep();
return returnable();
var value = input.substring(cursor, quoteSearch).replace(/""/g, '"');
return finish(value);
}
// If this quote is escaped, it's part of the data; skip it
@@ -1298,9 +1261,13 @@
// Appends the remaining input from cursor to the end into
// row, saves the row, calls step, and returns the results.
function finish()
function finish(value)
{
row.push(input.substr(cursor));
if (ignoreLastRow)
return returnable();
if (!value)
value = input.substr(cursor);
row.push(value);
cursor = inputLen; // important in case parsing is paused
pushRow(row);
if (stepIsFunction)

88
tests/test-cases.js

@@ -1,5 +1,10 @@
var RECORD_SEP = String.fromCharCode(30);
var UNIT_SEP = String.fromCharCode(31);
var FILES_ENABLED = false;
try {
new File([""], "");
FILES_ENABLED = true;
} catch (e) {} // safari, ie
// Tests for the core parser using new Papa.Parser().parse() (CSV to JSON)
var CORE_PARSER_TESTS = [
@@ -845,7 +850,28 @@ var PARSE_ASYNC_TESTS = [
data: [['A','B','C'],['X','Y','Z']],
errors: []
}
}
},
{
description: "Simple file",
input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
config: {
},
expected: {
data: [['A','B','C'],['X','Y','Z']],
errors: []
}
},
{
description: "Simple file + worker",
input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
config: {
worker: true,
},
expected: {
data: [['A','B','C'],['X','Y','Z']],
errors: []
}
}
];
@@ -1045,7 +1071,7 @@ var CUSTOM_TESTS = [
}
},
{
description: "Step exposes cursor for files",
description: "Step exposes cursor for downloads",
expected: [129, 287, 452, 595, 727, 865, 1031, 1209],
run: function(callback) {
var updates = [];
@@ -1060,9 +1086,8 @@ var CUSTOM_TESTS = [
}
},
{
description: "Step exposes cursor for chunked files",
// Tiny inconsistency: the last full row in each chunk will not see a newline.
expected: [129, 287, 451, 595, 727, 864, 1031, 1209],
description: "Step exposes cursor for chunked downloads",
expected: [129, 287, 452, 595, 727, 865, 1031, 1209],
run: function(callback) {
var updates = [];
Papa.parse("/tests/long-sample.csv", {
@@ -1078,8 +1103,7 @@ var CUSTOM_TESTS = [
},
{
description: "Step exposes cursor for workers",
// You're only really getting chunk cursors here.
expected: [451, 451, 451, 864, 864, 864, 1209, 1209],
expected: [452, 452, 452, 865, 865, 865, 1209, 1209],
run: function(callback) {
var updates = [];
Papa.parse("/tests/long-sample.csv", {
@@ -1112,7 +1136,7 @@ var CUSTOM_TESTS = [
},
{
description: "Chunk is called with cursor position",
expected: [451, 864, 1209],
expected: [452, 865, 1209],
run: function(callback) {
var updates = [];
Papa.parse("/tests/long-sample.csv", {
@@ -1126,4 +1150,52 @@ var CUSTOM_TESTS = [
});
}
},
{
description: "Step exposes indexes for files",
expected: [6, 12, 17],
disabled: !FILES_ENABLED,
run: function(callback) {
var updates = [];
Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
download: true,
step: function(response) {
updates.push(response.indexes[0]);
}, complete: function() {
callback(updates);
}
});
}
},
{
description: "Step exposes indexes for chunked files",
expected: [6, 12, 17],
disabled: !FILES_ENABLED,
run: function(callback) {
var updates = [];
Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
chunkSize: 3,
step: function(response) {
updates.push(response.indexes[0]);
}, complete: function() {
callback(updates);
}
});
}
},
{
description: "Quoted line breaks near chunk boundaries are handled",
expected: [['A', 'B', 'C'], ['X', 'Y\n1\n2\n3', 'Z']],
disabled: !FILES_ENABLED,
run: function(callback) {
var updates = [];
Papa.parse(new File(['A,B,C\nX,"Y\n1\n2\n3",Z'], 'sample.csv'), {
chunkSize: 3,
step: function(response) {
updates.push(response.data[0]);
}, complete: function() {
callback(updates);
}
});
}
}
];

12
tests/test-runner.js

@@ -100,7 +100,7 @@ function runParseTests(asyncDone)
{
var results = compare(actual.data, actual.errors, test.expected);
displayResults("tests-for-parse", test, actual, results);
displayResults("#tests-for-parse", test, actual, results);
if (results.data.passed && results.errors.passed) {
passCount++;
@@ -115,7 +115,7 @@ function runParseTests(asyncDone)
config.error = function(err)
{
failCount++;
displayResults(test, {data:[],errors:err}, test.expected);
displayResults("#tests-for-parse", test, {data:[],errors:err}, test.expected);
if (--asyncRemaining === 0) {
asyncDone();
}
@@ -308,7 +308,7 @@ function runUnparseTests()
// and renders results in the table.
function runCustomTests(asyncDone)
{
var asyncRemaining = CUSTOM_TESTS.length;
var asyncRemaining = 0;
for (var i = 0; i < CUSTOM_TESTS.length; i++)
{
runTest(CUSTOM_TESTS[i]);
@@ -316,6 +316,9 @@ function runCustomTests(asyncDone)
function runTest(test)
{
if (test.disabled)
return;
asyncRemaining++;
try
{
test.run(function(actual) {
@@ -391,6 +394,9 @@ function passOrFailTd(result)
// Reveals some hidden, whitespace, or invisible characters
function revealChars(txt)
{
if (typeof txt != 'string')
return '(file)';
// Make spaces and tabs more obvious when glancing
txt = txt.replace(/( |\t)/ig, '<span class="whitespace-char">$1</span>');
txt = txt.replace(/(\r\n|\n\r|\r|\n)/ig, '<span class="whitespace-char special-char">$1</span>$1');

Loading…
Cancel
Save