use parser to find last row end in chunks

10 years ago · 089d188084
3 changed files with 120 additions and 75 deletions
--- a/papaparse.js
+++ b/papaparse.js
@@ -522,34 +522,16 @@

 			finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr);

-			var lastLineEnd;
+			var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
+			var lastIndex = results.meta.cursor;
 			if (!finishedWithEntireFile)
 			{
-				lastLineEnd = aggregate.lastIndexOf("\r");
-
-				if (lastLineEnd == -1)
-					lastLineEnd = aggregate.lastIndexOf("\n");
-
-				if (lastLineEnd != -1)
-				{
-					partialLine = aggregate.substring(lastLineEnd + 1);	// skip the line ending character
-					aggregate = aggregate.substring(0, lastLineEnd);
-				}
-				else
-				{
-					// For chunk sizes smaller than a line (a line could not fit in a single chunk)
-					// we simply build our aggregate by reading in the next chunk, until we find a newline
-					nextChunk();
-					return;
-				}
+				partialLine = aggregate.substring(lastIndex - baseIndex);
+				baseIndex = lastIndex;
 			}
-
-			var results = handle.parse(aggregate, baseIndex);
-			aggregate = "";
-			if (!finishedWithEntireFile)
-				baseIndex += lastLineEnd + 1;
 			if (results && results.data)
 				rowCount += results.data.length;
+			aggregate = "";

 			var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);

@@ -705,34 +687,16 @@

 			finishedWithEntireFile = start >= file.size;

-			var lastLineEnd;
+			var results = handle.parse(aggregate, baseIndex, !finishedWithEntireFile);
+			var lastIndex = results.meta.cursor;
 			if (!finishedWithEntireFile)
 			{
-				lastLineEnd = aggregate.lastIndexOf("\r");	// TODO: Use an auto-detected line ending?
-
-				if (lastLineEnd == -1)
-					lastLineEnd = aggregate.lastIndexOf("\n");
-
-				if (lastLineEnd != -1)
-				{
-					partialLine = aggregate.substring(lastLineEnd + 1);	// skip the line ending character (TODO: Not always length 1? \r\n...)
-					aggregate = aggregate.substring(0, lastLineEnd);
+				partialLine = aggregate.substring(lastIndex - baseIndex);
+				baseIndex = lastIndex;
 			}
-				else
-				{
-					// For chunk sizes smaller than a line (a line could not fit in a single chunk)
-					// we simply build our aggregate by reading in the next chunk, until we find a newline
-					nextChunk();
-					return;
-				}
-			}
-
-			var results = handle.parse(aggregate, baseIndex);
-			aggregate = "";
-			if (!finishedWithEntireFile)
-				baseIndex += lastLineEnd + 1;
 			if (results && results.data)
 				rowCount += results.data.length;
+			aggregate = "";

 			var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview);

@@ -839,7 +803,7 @@
 			};
 		}

-		this.parse = function(input, baseIndex)
+		this.parse = function(input, baseIndex, ignoreLastRow)
 		{
 			if (!_config.newline)
 				_config.newline = guessLineEndings(input);
@@ -864,7 +828,7 @@

 			_input = input;
 			_parser = new Parser(parserConfig);
-			_results = _parser.parse(_input, baseIndex);
+			_results = _parser.parse(_input, baseIndex, ignoreLastRow);
 			processResults();
 			if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished()))
 				_config.complete(_results);
@@ -1112,7 +1076,7 @@
 		var cursor = 0;
 		var aborted = false;

-		this.parse = function(input, baseIndex)
+		this.parse = function(input, baseIndex, ignoreLastRow)
 		{
 			// For some reason, in Chrome, this speeds things up (!?)
 			if (typeof input !== 'string')
@@ -1182,6 +1146,7 @@

 						if (quoteSearch === -1)
 						{
+							if (!ignoreLastRow) {
 								// No closing quote... what a pity
 								errors.push({
 									type: "Quotes",
@@ -1190,17 +1155,15 @@
 									row: data.length,	// row has yet to be inserted
 									index: cursor
 								});
+							}
 							return finish();
 						}

 						if (quoteSearch === inputLen-1)
 						{
 							// Closing quote at EOF
-							row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"'));
-							pushRow(row);
-							if (stepIsFunction)
-								doStep();
-							return returnable();
+							var value = input.substring(cursor, quoteSearch).replace(/""/g, '"');
+							return finish(value);
 						}

 						// If this quote is escaped, it's part of the data; skip it
@@ -1298,9 +1261,13 @@

 			// Appends the remaining input from cursor to the end into
 			// row, saves the row, calls step, and returns the results.
-			function finish()
+			function finish(value)
 			{
-				row.push(input.substr(cursor));
+				if (ignoreLastRow)
+					return returnable();
+				if (!value)
+					value = input.substr(cursor);
+				row.push(value);
 				cursor = inputLen;	// important in case parsing is paused
 				pushRow(row);
 				if (stepIsFunction)
--- a/tests/test-cases.js
+++ b/tests/test-cases.js
@@ -1,5 +1,10 @@
 var RECORD_SEP = String.fromCharCode(30);
 var UNIT_SEP = String.fromCharCode(31);
+var FILES_ENABLED = false;
+try {
+  new File([""], "");
+  FILES_ENABLED = true;
+} catch (e) {} // safari, ie

 // Tests for the core parser using new Papa.Parser().parse() (CSV to JSON)
 var CORE_PARSER_TESTS = [
@@ -845,6 +850,27 @@ var PARSE_ASYNC_TESTS = [
 			data: [['A','B','C'],['X','Y','Z']],
 			errors: []
 		}
+	},
+  {
+    description: "Simple file",
+    input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
+		config: {
+		},
+		expected: {
+			data: [['A','B','C'],['X','Y','Z']],
+			errors: []
+		}
+  },
+  {
+    description: "Simple file + worker",
+    input: FILES_ENABLED ? new File(["A,B,C\nX,Y,Z"], "sample.csv") : false,
+		config: {
+			worker: true,
+		},
+		expected: {
+			data: [['A','B','C'],['X','Y','Z']],
+			errors: []
+		}
  }
 ];

@@ -1045,7 +1071,7 @@ var CUSTOM_TESTS = [
 		}
 	},
 	{
-		description: "Step exposes cursor for files",
+		description: "Step exposes cursor for downloads",
 		expected: [129,	287, 452, 595, 727, 865, 1031, 1209],
 		run: function(callback) {
 			var updates = [];
@@ -1060,9 +1086,8 @@ var CUSTOM_TESTS = [
 		}
 	},
 	{
-		description: "Step exposes cursor for chunked files",
-		// Tiny inconsistency: the last full row in each chunk will not see a newline.
-		expected: [129, 287, 451, 595, 727, 864, 1031, 1209],
+		description: "Step exposes cursor for chunked downloads",
+		expected: [129,	287, 452, 595, 727, 865, 1031, 1209],
 		run: function(callback) {
 			var updates = [];
 			Papa.parse("/tests/long-sample.csv", {
@@ -1078,8 +1103,7 @@ var CUSTOM_TESTS = [
 	},
 	{
 		description: "Step exposes cursor for workers",
-		// You're only really getting chunk cursors here.
-		expected: [451, 451, 451, 864, 864, 864, 1209, 1209],
+		expected: [452, 452, 452, 865, 865, 865, 1209, 1209],
 		run: function(callback) {
 			var updates = [];
 			Papa.parse("/tests/long-sample.csv", {
@@ -1112,7 +1136,7 @@ var CUSTOM_TESTS = [
 	},
 	{
 		description: "Chunk is called with cursor position",
-		expected: [451, 864, 1209],
+		expected: [452, 865, 1209],
 		run: function(callback) {
 			var updates = [];
 			Papa.parse("/tests/long-sample.csv", {
@@ -1126,4 +1150,52 @@ var CUSTOM_TESTS = [
 			});
 		}
 	},
+	{
+		description: "Step exposes indexes for files",
+		expected: [6, 12, 17],
+		disabled: !FILES_ENABLED,
+		run: function(callback) {
+			var updates = [];
+			Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
+				download: true,
+				step: function(response) {
+					updates.push(response.indexes[0]);
+				}, complete: function() {
+					callback(updates);
+				}
+			});
+		}
+	},
+	{
+		description: "Step exposes indexes for chunked files",
+		expected: [6, 12, 17],
+		disabled: !FILES_ENABLED,
+		run: function(callback) {
+			var updates = [];
+			Papa.parse(new File(['A,b,c\nd,E,f\nG,h,i'], 'sample.csv'), {
+				chunkSize: 3,
+				step: function(response) {
+					updates.push(response.indexes[0]);
+				}, complete: function() {
+					callback(updates);
+				}
+			});
+		}
+	},
+	{
+		description: "Quoted line breaks near chunk boundaries are handled",
+		expected: [['A', 'B', 'C'], ['X', 'Y\n1\n2\n3', 'Z']],
+		disabled: !FILES_ENABLED,
+		run: function(callback) {
+			var updates = [];
+			Papa.parse(new File(['A,B,C\nX,"Y\n1\n2\n3",Z'], 'sample.csv'), {
+				chunkSize: 3,
+				step: function(response) {
+					updates.push(response.data[0]);
+				}, complete: function() {
+					callback(updates);
+				}
+			});
+		}
+	}
 ];
--- a/tests/test-runner.js
+++ b/tests/test-runner.js
@@ -100,7 +100,7 @@ function runParseTests(asyncDone)
 		{
 			var results = compare(actual.data, actual.errors, test.expected);

-			displayResults("tests-for-parse", test, actual, results);
+			displayResults("#tests-for-parse", test, actual, results);

 			if (results.data.passed && results.errors.passed) {
 				passCount++;
@@ -115,7 +115,7 @@ function runParseTests(asyncDone)
 		config.error = function(err)
 		{
 			failCount++;
-			displayResults(test, {data:[],errors:err}, test.expected);
+			displayResults("#tests-for-parse", test, {data:[],errors:err}, test.expected);
 			if (--asyncRemaining === 0) {
 				asyncDone();
 			}
@@ -308,7 +308,7 @@ function runUnparseTests()
 // and renders results in the table.
 function runCustomTests(asyncDone)
 {
-	var asyncRemaining = CUSTOM_TESTS.length;
+	var asyncRemaining = 0;
 	for (var i = 0; i < CUSTOM_TESTS.length; i++)
 	{
 		runTest(CUSTOM_TESTS[i]);
@@ -316,6 +316,9 @@ function runCustomTests(asyncDone)

 	function runTest(test)
 	{
+		if (test.disabled)
+			return;
+		asyncRemaining++;
 		try
 		{
 			test.run(function(actual) {
@@ -391,6 +394,9 @@ function passOrFailTd(result)
 // Reveals some hidden, whitespace, or invisible characters
 function revealChars(txt)
 {
+	if (typeof txt != 'string')
+		return '(file)';
+
 	// Make spaces and tabs more obvious when glancing
 	txt = txt.replace(/( |\t)/ig, '<span class="whitespace-char">$1</span>');
 	txt = txt.replace(/(\r\n|\n\r|\r|\n)/ig, '<span class="whitespace-char special-char">$1</span>$1');