use parser to find last row end in chunks

11 years ago · 089d188084
3 changed files with 120 additions and 75 deletions
--- a/papaparse.js
+++ b/papaparse.js
@ -522,34 +522,16 @@
 			finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr);
-			var lastLineEnd;
+			var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
 			var lastIndex = results.meta.cursor;
 			if (!finishedWithEntireFile)
 			{
-				lastLineEnd = aggregate.lastIndexOf("\r");
+				partialLine = aggregate.substring(lastIndex - baseIndex);
-
+				baseIndex = lastIndex;
 				if (lastLineEnd == -1)
 					lastLineEnd = aggregate.lastIndexOf("\n");
 				if (lastLineEnd != -1)
 				{
 					partialLine = aggregate.substring(lastLineEnd + 1);	// skip the line ending character
 					aggregate = aggregate.substring(0, lastLineEnd);
 				}
 				else
 				{
 					// For chunk sizes smaller than a line (a line could not fit in a single chunk)
 					// we simply build our aggregate by reading in the next chunk, until we find a newline
 					nextChunk();
 					return;
 				}
 			}
 			var results = handle.parse(aggregate, baseIndex);
 			aggregate = "";
 			if (!finishedWithEntireFile)
 				baseIndex += lastLineEnd + 1;
 			if (results && results.data)
 				rowCount += results.data.length;
 			aggregate = "";
 			var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);
@ -705,34 +687,16 @@
 			finishedWithEntireFile = start >= file.size;
-			var lastLineEnd;
+			var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
 			var lastIndex = results.meta.cursor;
 			if (!finishedWithEntireFile)
 			{
-				lastLineEnd = aggregate.lastIndexOf("\r");	// TODO: Use an auto-detected line ending?
+				partialLine = aggregate.substring(lastIndex - baseIndex);
-
+				baseIndex = lastIndex;
 				if (lastLineEnd == -1)
 					lastLineEnd = aggregate.lastIndexOf("\n");
 				if (lastLineEnd != -1)
 				{
 					partialLine = aggregate.substring(lastLineEnd + 1);	// skip the line ending character (TODO: Not always length 1? \r\n...)
 					aggregate = aggregate.substring(0, lastLineEnd);
 				}
 				else
 				{
 					// For chunk sizes smaller than a line (a line could not fit in a single chunk)
 					// we simply build our aggregate by reading in the next chunk, until we find a newline
 					nextChunk();
 					return;
 				}
 			}
 			var results = handle.parse(aggregate, baseIndex);
 			aggregate = "";
 			if (!finishedWithEntireFile)
 				baseIndex += lastLineEnd + 1;
 			if (results && results.data)
 				rowCount += results.data.length;
 			aggregate = "";
 			var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);
@ -839,7 +803,7 @@
 			};
 		}
-		this.parse = function(input, baseIndex)
+		this.parse = function(input, baseIndex, ignoreLastRow)
 		{
 			if (!_config.newline)
 				_config.newline = guessLineEndings(input);
@ -864,7 +828,7 @@
 			_input = input;
 			_parser = new Parser(parserConfig);
-			_results = _parser.parse(_input, baseIndex);
+			_results = _parser.parse(_input, baseIndex, ignoreLastRow);
 			processResults();
 			if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished()))
 				_config.complete(_results);
@ -1112,7 +1076,7 @@
 		var cursor = 0;
 		var aborted = false;
-		this.parse = function(input, baseIndex)
+		this.parse = function(input, baseIndex, ignoreLastRow)
 		{
 			// For some reason, in Chrome, this speeds things up (!?)
 			if (typeof input !== 'string')
@ -1182,25 +1146,24 @@
 						if (quoteSearch === -1)
 						{
-							// No closing quote... what a pity
+							if (!ignoreLastRow) {
-							errors.push({
+								// No closing quote... what a pity
-								type: "Quotes",
+								errors.push({
-								code: "MissingQuotes",
+									type: "Quotes",
-								message: "Quoted field unterminated",
+									code: "MissingQuotes",
-								row: data.length,	// row has yet to be inserted
+									message: "Quoted field unterminated",
-								index: cursor
+									row: data.length,	// row has yet to be inserted
-							});
+									index: cursor
 								});
 							}
 							return finish();
 						}
 						if (quoteSearch === inputLen-1)
 						{
 							// Closing quote at EOF
-							row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"'));
+							var value = input.substring(cursor, quoteSearch).replace(/""/g, '"');
-							pushRow(row);
+							return finish(value);
 							if (stepIsFunction)
 								doStep();
 							return returnable();
 						}
 						// If this quote is escaped, it's part of the data; skip it
@ -1298,9 +1261,13 @@
 			// Appends the remaining input from cursor to the end into
 			// row, saves the row, calls step, and returns the results.
-			function finish()
+			function finish(value)
 			{
-				row.push(input.substr(cursor));
+				if (ignoreLastRow)
 					return returnable();
 				if (!value)
 					value = input.substr(cursor);
 				row.push(value);
 				cursor = inputLen;	// important in case parsing is paused
 				pushRow(row);
 				if (stepIsFunction)
--- a/tests/test-cases.js
+++ b/tests/test-cases.js
@ -1,5 +1,10 @@
 // Single-character strings built from char codes 30 and 31 (the ASCII
 // record-separator and unit-separator control characters).
 var RECORD_SEP = String.fromCharCode(30);
 var UNIT_SEP = String.fromCharCode(31);
 // Feature detection for the File constructor: FILES_ENABLED stays false
 // unless `new File(...)` below completes without throwing. File-based test
 // cases read this flag to decide whether they can build their input.
 var FILES_ENABLED = false;
 try {
  new File([""], "");
  FILES_ENABLED = true;
  // The empty catch is deliberate: a throw simply leaves FILES_ENABLED false.
 } catch (e) {} // safari, ie
 // Tests for the core parser using new Papa.Parser().parse() (CSV to JSON)
 var CORE_PARSER_TESTS = [
@ -845,7 +850,28 @@ var PARSE_ASYNC_TESTS = [
 			data: [['A','B','C'],['X','Y','Z']],
 			errors: []
 		}
-	}
+	},
  {
    // Parses a two-row CSV supplied as a File object instead of a string.
    description: "Simple file",
    // Falls back to `false` when the File constructor is unavailable.
    // NOTE(review): how the runner treats a `false` input is not visible here — confirm.
    input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
 		config: {
 		},
 		expected: {
 			data: [['A','B','C'],['X','Y','Z']],
 			errors: []
 		}
  },
  {
    // Same two-row File input and expected output as the "Simple file" case,
    // but with `worker: true` set in the config.
    description: "Simple file + worker",
    // Falls back to `false` when the File constructor is unavailable.
    // NOTE(review): how the runner treats a `false` input is not visible here — confirm.
    input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
 		config: {
 			worker: true,
 		},
 		expected: {
 			data: [['A','B','C'],['X','Y','Z']],
 			errors: []
 		}
  }
 ];
@ -1045,7 +1071,7 @@ var CUSTOM_TESTS = [
 		}
 	},
 	{
-		description: "Step exposes cursor for files",
+		description: "Step exposes cursor for downloads",
 		expected: [129,	287, 452, 595, 727, 865, 1031, 1209],
 		run: function(callback) {
 			var updates = [];
@ -1060,9 +1086,8 @@ var CUSTOM_TESTS = [
 		}
 	},
 	{
-		description: "Step exposes cursor for chunked files",
+		description: "Step exposes cursor for chunked downloads",
-		// Tiny inconsistency: the last full row in each chunk will not see a newline.
+		expected: [129,	287, 452, 595, 727, 865, 1031, 1209],
 		expected: [129, 287, 451, 595, 727, 864, 1031, 1209],
 		run: function(callback) {
 			var updates = [];
 			Papa.parse("/tests/long-sample.csv", {
@ -1078,8 +1103,7 @@ var CUSTOM_TESTS = [
 	},
 	{
 		description: "Step exposes cursor for workers",
-		// You're only really getting chunk cursors here.
+		expected: [452, 452, 452, 865, 865, 865, 1209, 1209],
 		expected: [451, 451, 451, 864, 864, 864, 1209, 1209],
 		run: function(callback) {
 			var updates = [];
 			Papa.parse("/tests/long-sample.csv", {
@ -1112,7 +1136,7 @@ var CUSTOM_TESTS = [
 	},
 	{
 		description: "Chunk is called with cursor position",
-		expected: [451, 864, 1209],
+		expected: [452, 865, 1209],
 		run: function(callback) {
 			var updates = [];
 			Papa.parse("/tests/long-sample.csv", {
@ -1126,4 +1150,52 @@ var CUSTOM_TESTS = [
 			});
 		}
 	},
 	{
 		// Collects response.indexes[0] from every step callback while parsing a
 		// three-row File, then hands the collected list to the runner's callback.
 		description: "Step exposes indexes for files",
 		// 6, 12, 17 match the cumulative character offsets after each row of the
 		// input below (rows 1 and 2 end in "\n"; row 3 has no trailing newline).
 		expected: [6, 12, 17],
 		// Skipped when the File constructor is unavailable.
 		disabled: !FILES_ENABLED,
 		run: function(callback) {
 			var updates = [];
 			Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
 				// NOTE(review): `download: true` is passed together with a File object,
 				// while the sibling "chunked files" test omits it — looks unnecessary
 				// for a non-URL input; confirm against Papa.parse's input handling.
 				download: true,
 				step: function(response) {
 					updates.push(response.indexes[0]);
 				}, complete: function() {
 					callback(updates);
 				}
 			});
 		}
 	},
 	{
 		// Same input and expected indexes as "Step exposes indexes for files",
 		// but parsed with chunkSize: 3, so the reported indexes must not depend
 		// on where the chunk boundaries fall.
 		description: "Step exposes indexes for chunked files",
 		expected: [6, 12, 17],
 		// Skipped when the File constructor is unavailable.
 		disabled: !FILES_ENABLED,
 		run: function(callback) {
 			var updates = [];
 			Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
 				// Each row is 5 characters plus a newline, so a chunk size of 3 is
 				// smaller than a single row (presumably measured in bytes — TODO confirm).
 				chunkSize: 3,
 				step: function(response) {
 					updates.push(response.indexes[0]);
 				}, complete: function() {
 					callback(updates);
 				}
 			});
 		}
 	},
 	{
 		// Parses a File whose second row contains a quoted field with three
 		// embedded line breaks, using a tiny chunk size, and collects the first
 		// data row reported by each step callback.
 		description: "Quoted line breaks near chunk boundaries are handled",
 		// The line breaks inside the quoted field must stay part of the field
 		// value instead of being treated as row endings.
 		expected: [['A', 'B', 'C'], ['X', 'Y\n1\n2\n3', 'Z']],
 		// Skipped when the File constructor is unavailable.
 		disabled: !FILES_ENABLED,
 		run: function(callback) {
 			var updates = [];
 			Papa.parse(new File(['A,B,C\nX,"Y\n1\n2\n3",Z'], 'sample.csv'), {
 				// A chunk size of 3 is shorter than the quoted field, so the field
 				// is presumably split across several chunks — TODO confirm units.
 				chunkSize: 3,
 				step: function(response) {
 					updates.push(response.data[0]);
 				}, complete: function() {
 					callback(updates);
 				}
 			});
 		}
 	}
 ];
--- a/tests/test-runner.js
+++ b/tests/test-runner.js
@ -100,7 +100,7 @@ function runParseTests(asyncDone)
 		{
 			var results = compare(actual.data, actual.errors, test.expected);
-			displayResults("tests-for-parse", test, actual, results);
+			displayResults("#tests-for-parse", test, actual, results);
 			if (results.data.passed && results.errors.passed) {
 				passCount++;
@ -115,7 +115,7 @@ function runParseTests(asyncDone)
 		config.error = function(err)
 		{
 			failCount++;
-			displayResults(test, {data:[],errors:err}, test.expected);
+			displayResults("#tests-for-parse", test, {data:[],errors:err}, test.expected);
 			if (--asyncRemaining === 0) {
 				asyncDone();
 			}
@ -308,7 +308,7 @@ function runUnparseTests()
 // and renders results in the table.
 function runCustomTests(asyncDone)
 {
-	var asyncRemaining = CUSTOM_TESTS.length;
+	var asyncRemaining = 0;
 	for (var i = 0; i < CUSTOM_TESTS.length; i++)
 	{
 		runTest(CUSTOM_TESTS[i]);
@ -316,6 +316,9 @@ function runCustomTests(asyncDone)
 	function runTest(test)
 	{
 		if (test.disabled)
 			return;
 		asyncRemaining++;
 		try
 		{
 			test.run(function(actual) {
@ -391,6 +394,9 @@ function passOrFailTd(result)
 // Reveals some hidden, whitespace, or invisible characters
 function revealChars(txt)
 {
 	if (typeof txt != 'string')
 		return '(file)';
 	// Make spaces and tabs more obvious when glancing
 	txt = txt.replace(/( |\t)/ig, '<span class="whitespace-char">$1</span>');
 	txt = txt.replace(/(\r\n|\n\r|\r|\n)/ig, '<span class="whitespace-char special-char">$1</span>$1');