diff --git a/demo.html b/demo.html index 6c9389a..2d5106c 100644 --- a/demo.html +++ b/demo.html @@ -86,7 +86,7 @@ diff --git a/docs.html b/docs.html index 5285801..bbae6af 100644 --- a/docs.html +++ b/docs.html @@ -344,7 +344,7 @@ var csv = Papa.unparse({ download: false, skipEmptyLines: false, chunk: undefined, - fastMode: false + fastMode: undefined }
@@ -388,7 +388,7 @@ var csv = Papa.unparse({ dynamicTyping - If true, numeric and boolean data will be converted to their type instead of remaining strings. + If true, numeric and boolean data will be converted to their type instead of remaining strings. Numeric data must conform to the definition of a decimal literal. (European-formatted numbers must have commas and dots swapped.) @@ -429,11 +429,11 @@ var csv = Papa.unparse({ To stream the input, define a callback function: -
step: function(results, handle) {
+									
step: function(results, parser) {
 	console.log("Row data:", results.data);
 	console.log("Row errors:", results.errors);
 }
- Streaming is necessary for large files which would otherwise crash the browser. Except when using a Web Worker, you can call handle.abort() to stop parsing, handle.pause() to pause it, or handle.resume() to resume. + Streaming is necessary for large files which would otherwise crash the browser. You can call parser.abort() to abort parsing. And, except when using a Web Worker, you can call parser.pause() to pause it, and parser.resume() to resume. @@ -486,7 +486,7 @@ var csv = Papa.unparse({ fastMode - When enabled, fast mode executes parsing much more quickly. However, this only works for input without quoted fields. If you cannot guarantee that, do not enable fast mode. + Fast mode speeds up parsing significantly for large inputs. However, it only works when the input has no quoted fields. Fast mode will automatically be enabled if no " characters appear in the input. You can force fast mode either way by setting it to true or false. @@ -693,7 +693,7 @@ var csv = Papa.unparse({ Papa.SCRIPT_PATH - The relative path to Papa Parse. This is automatically detected when Papa Parse is loaded synchronously. However, if you load Papa Parse asynchronously (e.g. with RequireJS), you need to set this variable manually in order to use Web Workers. (In those cases, this variable is not read-only.) + The relative path to Papa Parse. This is automatically detected when Papa Parse is loaded synchronously. However, if you load Papa Parse asynchronously (e.g. with RequireJS), you need to set this variable manually in order to use Web Workers. (In those cases, this variable is not read-only and you should set it!) @@ -726,48 +726,12 @@ var csv = Papa.unparse({ Papa.DefaultDelimiter - The delimiter used when one is not specified and it cannot be detected automatically. Default is comma. + The delimiter used when it is left unspecified and cannot be detected automatically. Default is comma. -
-
For Internal Use Only
-
- -
- - - - - - - - - - - - - - - - - - - - - -
Internal PropertyExplanation
Papa.Parser - The core parsing component. Careful, it's fast and under rigorous test. -
Papa.ParserHandle - A wrapper over the Parser which provides dynamic typing and header row support. -
Papa.NetworkStreamer - Facilitates downloading and parsing files in chunks over the network with XMLHttpRequest. -
Papa.FileStreamer - Similar to NetworkStreamer, but for local files, and using the HTML5 FileReader. -
-
diff --git a/faq.html b/faq.html index 2e99e2f..6b0fc9e 100644 --- a/faq.html +++ b/faq.html @@ -100,7 +100,6 @@

-
Is it open source? (Can I contribute something?)

Yes, please! I don't want to do this all by myself. Head over to the GitHub project page and hack away. If you're making a significant change, open an issue first so we can talk about it. @@ -108,9 +107,9 @@ -

Why wouldn't I always enable fast mode?
+
What's the deal with fast mode?

- Fast mode makes Papa Parse screaming fast, but you wouldn't want to use it if there are (or may be) quoted fields in your input. Fast mode is fast because it makes one major assumption: no quoted fields. But if you know that your input has no quotes, turn that sucker on. With fast mode on, 1 GB files can be parsed in about 20 seconds. + Fast mode makes Papa Parse screaming fast, but you wouldn't want to use it if there are quoted fields in your input. Fast mode is fast because it makes one major assumption: no quoted fields. If you don't specify fastMode either way, fast mode will be turned on automatically if there are no quote characters in the input. With fast mode on, 1 GB files can be parsed in about 20 seconds.

@@ -126,7 +125,7 @@
Can Papa load and parse huge files?

- Yes. Parsing huge text files is facilitated by streaming, where the file is loaded a little bit at a time, parsed, and the results are sent to your step callback function, row-by-row. + Yes. Parsing huge text files is facilitated by streaming, where the file is loaded a little bit at a time, parsed, and the results are sent to your step callback function, row-by-row. You can also get results chunk-by-chunk (which is usually faster) by using the chunk callback function in the same way.

How do I stream my input?
@@ -134,6 +133,11 @@ Just specify a step callback function. Results will not be available after parsing is finished, however. You have to inspect the results one row at a time.

+
What if I want more than 1 row at a time?
+

+ Use the chunk callback instead. It works just like step, but you get an entire chunk of the file at a time, rather than a single row. Don't try to use step and chunk together (the behavior is undefined). +

+
What is a stream and when should I stream files?

A stream is a unique data structure which, given infinite time, gives you infinite space. @@ -172,7 +176,7 @@

Can I pause and resume parsing?

- Yes, as long as you are streaming and not using a worker. Your step callback is passed a ParserHandle which has pause, resume, and abort functions. + Yes, as long as you are streaming and not using a worker. Your step callback (same with the chunk callback) is passed a parser which has pause, resume, and abort functions. This is exceptionally useful when performing asynchronous actions during parsing, for example, AJAX requests. You can always abort parsing in your callback, even when using workers, but pause and resume are only available without a worker.

@@ -217,6 +221,11 @@

Yup. If the input is too large to fit in memory (or large enough to crash the browser), streaming is always the answer, even in a worker thread. Workers keep the page reactive. Streaming makes it able to fit in memory. Use both if you need to.

+ +
Can I pause/resume workers?
+

+ No. This would drastically slow down parsing, as it would require the worker to wait after every chunk for a "continue" signal from the main thread. But you can abort workers by calling .abort() on the parser that gets passed to your callback function. +

diff --git a/index.html b/index.html index 4955bed..e647bc5 100644 --- a/index.html +++ b/index.html @@ -26,7 +26,7 @@

Papa Parse

The powerful, in-browser CSV parser for big boys and girls

- +   Download @@ -99,7 +99,7 @@ Papa.parse(bigFile, {
-
Version
4.0
+
Version
4.1
diff --git a/resources/js/papaparse.js b/resources/js/papaparse.js index a524fc0..e445f78 100644 --- a/resources/js/papaparse.js +++ b/resources/js/papaparse.js @@ -1,34 +1,15 @@ -/* +/*! Papa Parse - v4.0.7 + v4.1.0 https://github.com/mholt/PapaParse */ (function(global) { "use strict"; - var IS_WORKER = !global.document, SCRIPT_PATH; + var IS_WORKER = !global.document, LOADED_SYNC = false, AUTO_SCRIPT_PATH; var workers = {}, workerIdCounter = 0; - // A configuration object from which to draw default settings - var DEFAULTS = { - delimiter: "", // empty: auto-detect - newline: "", // empty: auto-detect - header: false, - dynamicTyping: false, - preview: 0, - step: undefined, - encoding: "", // browser should default to "UTF-8" - worker: false, - comments: false, - complete: undefined, - error: undefined, - download: false, - chunk: undefined, - skipEmptyLines: false, - fastMode: false - }; - global.Papa = {}; global.Papa.parse = CsvToJson; @@ -39,6 +20,7 @@ global.Papa.BYTE_ORDER_MARK = "\ufeff"; global.Papa.BAD_DELIMITERS = ["\r", "\n", "\"", global.Papa.BYTE_ORDER_MARK]; global.Papa.WORKERS_SUPPORTED = !!global.Worker; + global.Papa.SCRIPT_PATH = null; // Must be set manually if using workers and Papa Parse is loaded asynchronously // Configurable chunk sizes for local and remote files, respectively global.Papa.LocalChunkSize = 1024 * 1024 * 10; // 10 MB @@ -50,6 +32,7 @@ global.Papa.ParserHandle = ParserHandle; global.Papa.NetworkStreamer = NetworkStreamer; global.Papa.FileStreamer = FileStreamer; + global.Papa.StringStreamer = StringStreamer; if (global.jQuery) { @@ -147,90 +130,70 @@ if (IS_WORKER) + { global.onmessage = workerThreadReceivedMessage; + } else if (Papa.WORKERS_SUPPORTED) - SCRIPT_PATH = getScriptPath(); + { + AUTO_SCRIPT_PATH = getScriptPath(); + + // Check if the script was loaded synchronously + if (!document.body) + { + // Body doesn't exist yet, must be synchronous + LOADED_SYNC = true; + } + else + { + 
document.addEventListener('DOMContentLoaded', function () { + LOADED_SYNC = true; + }, true); + } + } function CsvToJson(_input, _config) { - var config = IS_WORKER ? _config : copyAndValidateConfig(_config); - var useWorker = config.worker && Papa.WORKERS_SUPPORTED && SCRIPT_PATH; + _config = _config || {}; - if (useWorker) + if (_config.worker && Papa.WORKERS_SUPPORTED) { var w = newWorker(); - w.userStep = config.step; - w.userChunk = config.chunk; - w.userComplete = config.complete; - w.userError = config.error; + w.userStep = _config.step; + w.userChunk = _config.chunk; + w.userComplete = _config.complete; + w.userError = _config.error; - config.step = isFunction(config.step); - config.chunk = isFunction(config.chunk); - config.complete = isFunction(config.complete); - config.error = isFunction(config.error); - delete config.worker; // prevent infinite loop + _config.step = isFunction(_config.step); + _config.chunk = isFunction(_config.chunk); + _config.complete = isFunction(_config.complete); + _config.error = isFunction(_config.error); + delete _config.worker; // prevent infinite loop w.postMessage({ input: _input, - config: config, + config: _config, workerId: w.id }); + + return; } - else - { - if (typeof _input === 'string') - { - if (config.download) - { - var streamer = new NetworkStreamer(config); - streamer.stream(_input); - } - else - { - var ph = new ParserHandle(config); - var results = ph.parse(_input); - return results; - } - } - else if ((global.File && _input instanceof File) || _input instanceof Object) // ...Safari. 
(see issue #106) - { - if (config.step || config.chunk) - { - var streamer = new FileStreamer(config); - streamer.stream(_input); - } - else - { - var ph = new ParserHandle(config); - if (IS_WORKER) - { - var reader = new FileReaderSync(); - var input = reader.readAsText(_input, config.encoding); - return ph.parse(input); - } - else - { - reader = new FileReader(); - reader.onload = function(event) - { - var ph = new ParserHandle(config); - var results = ph.parse(event.target.result); - }; - reader.onerror = function() - { - if (isFunction(config.error)) - config.error(reader.error, _input); - }; - reader.readAsText(_input, config.encoding); - } - } - } + var streamer = null; + if (typeof _input === 'string') + { + if (_config.download) + streamer = new NetworkStreamer(_config); + else + streamer = new StringStreamer(_config); } + else if ((global.File && _input instanceof File) || _input instanceof Object) // ...Safari. (see issue #106) + streamer = new FileStreamer(_config); + + return streamer.stream(_input); } @@ -388,72 +351,131 @@ } } + // ChunkStreamer is the base prototype for various streamer implementations. + function ChunkStreamer(config) + { + this._handle = null; + this._paused = false; + this._finished = false; + this._input = null; + this._baseIndex = 0; + this._partialLine = ""; + this._rowCount = 0; + this._start = 0; + this._nextChunk = null; + replaceConfig.call(this, config); + + this.parseChunk = function(chunk) + { + // Rejoin the line we likely just split in two by chunking the file + var aggregate = this._partialLine + chunk; + this._partialLine = ""; + var results = this._handle.parse(aggregate, this._baseIndex, !this._finished); + + if (this._handle.paused()) + return; + + var lastIndex = results.meta.cursor; + + if (!this._finished) + { + this._partialLine = aggregate.substring(lastIndex - this._baseIndex); + this._baseIndex = lastIndex; + } - // TODO: Many of the functions of NetworkStreamer and FileStreamer are similar or the same. 
Consolidate? - function NetworkStreamer(config) - { - config = config || {}; - if (!config.chunkSize) - config.chunkSize = Papa.RemoteChunkSize; + if (results && results.data) + this._rowCount += results.data.length; - var start = 0, fileSize = 0, rowCount = 0; - var aggregate = ""; - var partialLine = ""; - var xhr, url, nextChunk, finishedWithEntireFile; - var userComplete, handle, configCopy; - replaceConfig(config); + var finishedIncludingPreview = this._finished || (this._config.preview && this._rowCount >= this._config.preview); - this.resume = function() - { - paused = false; - nextChunk(); - }; + if (IS_WORKER) + { + global.postMessage({ + results: results, + workerId: Papa.WORKER_ID, + finished: finishedIncludingPreview + }); + } + else if (isFunction(this._config.chunk)) + { + this._config.chunk(results, this._handle); + if (this._paused) + return; + results = undefined; + } - this.finished = function() - { - return finishedWithEntireFile; + if (finishedIncludingPreview && isFunction(this._config.complete) && (!results || !results.meta.aborted)) + this._config.complete(results); + + if (!finishedIncludingPreview && (!results || !results.meta.paused)) + this._nextChunk(); + + return results; }; - this.pause = function() + this._sendError = function(error) { - paused = true; + if (isFunction(this._config.error)) + this._config.error(error); + else if (IS_WORKER && this._config.error) + { + global.postMessage({ + workerId: Papa.WORKER_ID, + error: error, + finished: false + }); + } }; - this.abort = function() + function replaceConfig(config) { - finishedWithEntireFile = true; - if (isFunction(userComplete)) - userComplete({ data: [], errors: [], meta: { aborted: true } }); - }; + // Deep-copy the config so we can edit it + var configCopy = copy(config); + configCopy.chunkSize = parseInt(configCopy.chunkSize); // VERY important so we don't concatenate strings! 
+ this._handle = new ParserHandle(configCopy); + this._handle.streamer = this; + this._config = configCopy; // persist the copy to the caller + } + } + - this.stream = function(u) + function NetworkStreamer(config) + { + config = config || {}; + if (!config.chunkSize) + config.chunkSize = Papa.RemoteChunkSize; + ChunkStreamer.call(this, config); + + var xhr; + + if (IS_WORKER) { - url = u; - if (IS_WORKER) + this._nextChunk = function() { - nextChunk = function() - { - readChunk(); - chunkLoaded(); - }; - } - else + this._readChunk(); + this._chunkLoaded(); + }; + } + else + { + this._nextChunk = function() { - nextChunk = function() - { - readChunk(); - }; - } + this._readChunk(); + }; + } - nextChunk(); // Starts streaming + this.stream = function(url) + { + this._input = url; + this._nextChunk(); // Starts streaming }; - function readChunk() + this._readChunk = function() { - if (finishedWithEntireFile) + if (this._finished) { - chunkLoaded(); + this._chunkLoaded(); return; } @@ -461,112 +483,51 @@ if (!IS_WORKER) { - xhr.onload = chunkLoaded; - xhr.onerror = chunkError; + xhr.onload = bindFunction(this._chunkLoaded, this); + xhr.onerror = bindFunction(this._chunkError, this); } - xhr.open("GET", url, !IS_WORKER); + xhr.open("GET", this._input, !IS_WORKER); - if (config.step || config.chunk) + if (this._config.step || this._config.chunk) { - var end = start + configCopy.chunkSize - 1; // minus one because byte range is inclusive - if (fileSize && end > fileSize) // Hack around a Chrome bug: http://stackoverflow.com/q/24745095/1048862 - end = fileSize; - xhr.setRequestHeader("Range", "bytes="+start+"-"+end); + var end = this._start + this._config.chunkSize - 1; // minus one because byte range is inclusive + xhr.setRequestHeader("Range", "bytes="+this._start+"-"+end); + xhr.setRequestHeader("If-None-Match", "webkit-no-cache"); // https://bugs.webkit.org/show_bug.cgi?id=82672 } try { xhr.send(); } catch (err) { - chunkError(err.message); + 
this._chunkError(err.message); } if (IS_WORKER && xhr.status == 0) - chunkError(); + this._chunkError(); else - start += configCopy.chunkSize; + this._start += this._config.chunkSize; } - function chunkLoaded() + this._chunkLoaded = function() { if (xhr.readyState != 4) return; if (xhr.status < 200 || xhr.status >= 400) { - chunkError(); + this._chunkError(); return; } - // Rejoin the line we likely just split in two by chunking the file - aggregate += partialLine + xhr.responseText; - partialLine = ""; - - finishedWithEntireFile = (!config.step && !config.chunk) || start > getFileSize(xhr); - - if (!finishedWithEntireFile) - { - var lastLineEnd = aggregate.lastIndexOf("\r"); - - if (lastLineEnd == -1) - lastLineEnd = aggregate.lastIndexOf("\n"); - - if (lastLineEnd != -1) - { - partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character - aggregate = aggregate.substring(0, lastLineEnd); - } - else - { - // For chunk sizes smaller than a line (a line could not fit in a single chunk) - // we simply build our aggregate by reading in the next chunk, until we find a newline - nextChunk(); - return; - } - } - - var results = handle.parse(aggregate); - aggregate = ""; - if (results && results.data) - rowCount += results.data.length; - - var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview); - - if (IS_WORKER) - { - global.postMessage({ - results: results, - workerId: Papa.WORKER_ID, - finished: finishedIncludingPreview - }); - } - else if (isFunction(config.chunk)) - { - config.chunk(results, handle); - results = undefined; - } - - if (isFunction(userComplete) && finishedIncludingPreview) - userComplete(results); - - if (!finishedIncludingPreview && (!results || !results.meta.paused)) - nextChunk(); + this._finished = (!this._config.step && !this._config.chunk) || this._start > getFileSize(xhr); + this.parseChunk(xhr.responseText); } - function chunkError(errorMessage) + this._chunkError 
= function(errorMessage) { var errorText = xhr.statusText || errorMessage; - if (isFunction(config.error)) - config.error(errorText); - else if (IS_WORKER && config.error) - { - global.postMessage({ - workerId: Papa.WORKER_ID, - error: errorText, - finished: false - }); - } + this._sendError(errorText); } function getFileSize(xhr) @@ -574,28 +535,9 @@ var contentRange = xhr.getResponseHeader("Content-Range"); return parseInt(contentRange.substr(contentRange.lastIndexOf("/") + 1)); } - - function replaceConfig(config) - { - // Deep-copy the config so we can edit it; we need - // to call the complete function if we are to ensure - // that the last chunk callback, if any, will be called - // BEFORE the complete function. - configCopy = copy(config); - userComplete = configCopy.complete; - configCopy.complete = undefined; - configCopy.chunkSize = parseInt(configCopy.chunkSize); // VERY important so we don't concatenate strings! - handle = new ParserHandle(configCopy); - handle.streamer = this; - } } - - - - - - - + NetworkStreamer.prototype = Object.create(ChunkStreamer.prototype); + NetworkStreamer.prototype.constructor = NetworkStreamer; function FileStreamer(config) @@ -603,171 +545,88 @@ config = config || {}; if (!config.chunkSize) config.chunkSize = Papa.LocalChunkSize; + ChunkStreamer.call(this, config); - var start = 0; - var file; - var slice; - var aggregate = ""; - var partialLine = ""; - var rowCount = 0; - var paused = false; - var self = this; - var reader, nextChunk, slice, finishedWithEntireFile; - var userComplete, handle, configCopy; - replaceConfig(config); + var reader, slice; // FileReader is better than FileReaderSync (even in worker) - see http://stackoverflow.com/q/24708649/1048862 // But Firefox is a pill, too - see issue #76: https://github.com/mholt/PapaParse/issues/76 var usingAsyncReader = typeof FileReader !== 'undefined'; // Safari doesn't consider it a function - see issue #105 - this.stream = function(f) + this.stream = function(file) { 
- file = f; + this._input = file; slice = file.slice || file.webkitSlice || file.mozSlice; if (usingAsyncReader) { reader = new FileReader(); // Preferred method of reading files, even in workers - reader.onload = chunkLoaded; - reader.onerror = chunkError; + reader.onload = bindFunction(this._chunkLoaded, this); + reader.onerror = bindFunction(this._chunkError, this); } else reader = new FileReaderSync(); // Hack for running in a web worker in Firefox - nextChunk(); // Starts streaming + this._nextChunk(); // Starts streaming }; - this.finished = function() + this._nextChunk = function() { - return finishedWithEntireFile; - }; - - this.pause = function() - { - paused = true; - }; - - this.resume = function() - { - paused = false; - nextChunk(); - }; - - this.abort = function() - { - finishedWithEntireFile = true; - if (isFunction(userComplete)) - userComplete({ data: [], errors: [], meta: { aborted: true } }); - }; - - function nextChunk() - { - if (!finishedWithEntireFile && (!configCopy.preview || rowCount < configCopy.preview)) - readChunk(); + if (!this._finished && (!this._config.preview || this._rowCount < this._config.preview)) + this._readChunk(); } - function readChunk() + this._readChunk = function() { - var end = Math.min(start + configCopy.chunkSize, file.size); - var txt = reader.readAsText(slice.call(file, start, end), config.encoding); + var end = Math.min(this._start + this._config.chunkSize, this._input.size); + var txt = reader.readAsText(slice.call(this._input, this._start, end), this._config.encoding); if (!usingAsyncReader) - chunkLoaded({ target: { result: txt } }); // mimic the async signature + this._chunkLoaded({ target: { result: txt } }); // mimic the async signature } - function chunkLoaded(event) + this._chunkLoaded = function(event) { // Very important to increment start each time before handling results - start += configCopy.chunkSize; - - // Rejoin the line we likely just split in two by chunking the file - aggregate += partialLine 
+ event.target.result; - partialLine = ""; - - finishedWithEntireFile = start >= file.size; - - if (!finishedWithEntireFile) - { - var lastLineEnd = aggregate.lastIndexOf("\r"); // TODO: Use an auto-detected line ending? - - if (lastLineEnd == -1) - lastLineEnd = aggregate.lastIndexOf("\n"); - - if (lastLineEnd != -1) - { - partialLine = aggregate.substring(lastLineEnd + 1); // skip the line ending character (TODO: Not always length 1? \r\n...) - aggregate = aggregate.substring(0, lastLineEnd); - } - else - { - // For chunk sizes smaller than a line (a line could not fit in a single chunk) - // we simply build our aggregate by reading in the next chunk, until we find a newline - nextChunk(); - return; - } - } - - var results = handle.parse(aggregate); - aggregate = ""; - if (results && results.data) - rowCount += results.data.length; + this._start += this._config.chunkSize; + this._finished = this._start >= this._input.size; + this.parseChunk(event.target.result); + } - var finishedIncludingPreview = finishedWithEntireFile || (configCopy.preview && rowCount >= configCopy.preview); + this._chunkError = function() + { + this._sendError(reader.error); + } - if (IS_WORKER) - { - global.postMessage({ - results: results, - workerId: Papa.WORKER_ID, - finished: finishedIncludingPreview - }); - } - else if (isFunction(config.chunk)) - { - config.chunk(results, self, file); - if (paused) - return; - results = undefined; - } + } + FileStreamer.prototype = Object.create(ChunkStreamer.prototype); + FileStreamer.prototype.constructor = FileStreamer; - if (isFunction(userComplete) && finishedIncludingPreview) - userComplete(results); - if (!finishedIncludingPreview && (!results || !results.meta.paused)) - nextChunk(); - } + function StringStreamer(config) + { + config = config || {}; + ChunkStreamer.call(this, config); - function chunkError() + var string; + var remaining; + this.stream = function(s) { - if (isFunction(config.error)) - config.error(reader.error, file); - else if 
(IS_WORKER && config.error) - { - global.postMessage({ - workerId: Papa.WORKER_ID, - error: reader.error, - file: file, - finished: false - }); - } + string = s; + remaining = s; + return this._nextChunk(); } - - function replaceConfig(config) + this._nextChunk = function() { - // Deep-copy the config so we can edit it; we need - // to call the complete function if we are to ensure - // that the last chunk callback, if any, will be called - // BEFORE the complete function. - configCopy = copy(config); - userComplete = configCopy.complete; - configCopy.complete = undefined; - configCopy.chunkSize = parseInt(configCopy.chunkSize); // VERY important so we don't concatenate strings! - handle = new ParserHandle(configCopy); - handle.streamer = this; + if (this._finished) return; + var size = this._config.chunkSize; + var chunk = size ? remaining.substr(0, size) : remaining; + remaining = size ? remaining.substr(size) : ''; + this._finished = !remaining; + return this.parseChunk(chunk); } - } - - + StringStreamer.prototype = Object.create(StringStreamer.prototype); + StringStreamer.prototype.constructor = StringStreamer; @@ -816,7 +675,10 @@ }; } - this.parse = function(input) + // Parses input. Most users won't need, and shouldn't mess with, the baseIndex + // and ignoreLastRow parameters. They are used by streamers (wrapper functions) + // when an input comes in multiple chunks, like from a file. + this.parse = function(input, baseIndex, ignoreLastRow) { if (!_config.newline) _config.newline = guessLineEndings(input); @@ -841,13 +703,16 @@ _input = input; _parser = new Parser(parserConfig); - _results = _parser.parse(_input); + _results = _parser.parse(_input, baseIndex, ignoreLastRow); processResults(); - if (isFunction(_config.complete) && !_paused && (!self.streamer || self.streamer.finished())) - _config.complete(_results); return _paused ? 
{ meta: { paused: true } } : (_results || { meta: { paused: false } }); }; + this.paused = function() + { + return _paused; + }; + this.pause = function() { _paused = true; @@ -858,15 +723,7 @@ this.resume = function() { _paused = false; - _parser = new Parser(_config); - _parser.parse(_input); - if (!_paused) - { - if (self.streamer && !self.streamer.finished()) - self.streamer.resume(); // more of the file yet to come - else if (isFunction(_config.complete)) - _config.complete(_results); - } + self.streamer.parseChunk(_input); }; this.abort = function() @@ -1089,13 +946,13 @@ var cursor = 0; var aborted = false; - this.parse = function(input) + this.parse = function(input, baseIndex, ignoreLastRow) { // For some reason, in Chrome, this speeds things up (!?) if (typeof input !== 'string') throw "Input must be a string"; - // We don't need to compute these every time parse() is called, + // We don't need to compute some of these every time parse() is called, // but having them in a more local scope seems to perform better var inputLen = input.length, delimLen = delim.length, @@ -1105,28 +962,34 @@ // Establish starting state cursor = 0; - var data = [], errors = [], row = []; + var data = [], errors = [], row = [], lastCursor = 0; if (!input) return returnable(); - if (fastMode) + if (fastMode || (fastMode !== false && input.indexOf('"') === -1)) { - // Fast mode assumes there are no quoted fields in the input var rows = input.split(newline); for (var i = 0; i < rows.length; i++) { - if (comments && rows[i].substr(0, commentsLen) == comments) + var row = rows[i]; + cursor += row.length; + if (i !== rows.length - 1) + cursor += newline.length; + else if (ignoreLastRow) + return returnable(); + if (comments && row.substr(0, commentsLen) == comments) continue; if (stepIsFunction) { - data = [ rows[i].split(delim) ]; + data = []; + pushRow(row.split(delim)); doStep(); if (aborted) return returnable(); } else - data.push(rows[i].split(delim)); + 
pushRow(row.split(delim)); if (preview && i >= preview) { data = data.slice(0, preview); @@ -1156,27 +1019,26 @@ // Find closing quote var quoteSearch = input.indexOf('"', quoteSearch+1); - if (quoteSearch == -1) + if (quoteSearch === -1) { - // No closing quote... what a pity - errors.push({ - type: "Quotes", - code: "MissingQuotes", - message: "Quoted field unterminated", - row: data.length, // row has yet to be inserted - index: cursor - }); + if (!ignoreLastRow) { + // No closing quote... what a pity + errors.push({ + type: "Quotes", + code: "MissingQuotes", + message: "Quoted field unterminated", + row: data.length, // row has yet to be inserted + index: cursor + }); + } return finish(); } - if (quoteSearch == inputLen-1) + if (quoteSearch === inputLen-1) { // Closing quote at EOF - row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"')); - data.push(row); - if (stepIsFunction) - doStep(); - return returnable(); + var value = input.substring(cursor, quoteSearch).replace(/""/g, '"'); + return finish(value); } // If this quote is escaped, it's part of the data; skip it @@ -1196,7 +1058,7 @@ break; } - if (input.substr(quoteSearch+1, newlineLen) == newline) + if (input.substr(quoteSearch+1, newlineLen) === newline) { // Closing quote followed by newline row.push(input.substring(cursor, quoteSearch).replace(/""/g, '"')); @@ -1221,7 +1083,7 @@ } // Comment found at start of new line - if (comments && row.length == 0 && input.substr(cursor, commentsLen) == comments) + if (comments && row.length === 0 && input.substr(cursor, commentsLen) === comments) { if (nextNewline == -1) // Comment ends at EOF return returnable(); @@ -1232,7 +1094,7 @@ } // Next delimiter comes before next newline, so we've reached end of field - if (nextDelim != -1 && (nextDelim < nextNewline || nextNewline == -1)) + if (nextDelim !== -1 && (nextDelim < nextNewline || nextNewline === -1)) { row.push(input.substring(cursor, nextDelim)); cursor = nextDelim + delimLen; @@ -1241,7 
+1103,7 @@ } // End of row - if (nextNewline != -1) + if (nextNewline !== -1) { row.push(input.substring(cursor, nextNewline)); saveRow(nextNewline + newlineLen); @@ -1266,13 +1128,23 @@ return finish(); + function pushRow(row) + { + data.push(row); + lastCursor = cursor; + } + // Appends the remaining input from cursor to the end into // row, saves the row, calls step, and returns the results. - function finish() + function finish(value) { - row.push(input.substr(cursor)); - data.push(row); + if (ignoreLastRow) + return returnable(); + if (!value) + value = input.substr(cursor); + row.push(value); cursor = inputLen; // important in case parsing is paused + pushRow(row); if (stepIsFunction) doStep(); return returnable(); @@ -1284,9 +1156,9 @@ // preview and end parsing if necessary. function saveRow(newCursor) { - data.push(row); - row = []; cursor = newCursor; + pushRow(row); + row = []; nextNewline = input.indexOf(newline, cursor); } @@ -1300,7 +1172,8 @@ delimiter: delim, linebreak: newline, aborted: aborted, - truncated: !!stopped + truncated: !!stopped, + cursor: lastCursor + (baseIndex || 0) } }; } @@ -1331,16 +1204,20 @@ // the script path here. See: https://github.com/mholt/PapaParse/issues/87#issuecomment-57885358 function getScriptPath() { - var id = "worker" + String(Math.random()).substr(2); - document.write(''); - return document.getElementById(id).previousSibling.src; + var scripts = document.getElementsByTagName('script'); + return scripts.length ? scripts[scripts.length - 1].src : ''; } function newWorker() { if (!Papa.WORKERS_SUPPORTED) return false; - var w = new global.Worker(SCRIPT_PATH); + if (!LOADED_SYNC && Papa.SCRIPT_PATH === null) + throw new Error( + 'Script path cannot be determined automatically when Papa Parse is loaded asynchronously. ' + + 'You need to set Papa.SCRIPT_PATH manually.' 
+ ); + var w = new global.Worker(Papa.SCRIPT_PATH || AUTO_SCRIPT_PATH); w.onmessage = mainThreadReceivedMessage; w.id = workerIdCounter++; workers[w.id] = w; @@ -1352,11 +1229,23 @@ { var msg = e.data; var worker = workers[msg.workerId]; + var aborted = false; if (msg.error) worker.userError(msg.error, msg.file); else if (msg.results && msg.results.data) { + var abort = function() { + aborted = true; + completeWorker(msg.workerId, { data: [], errors: [], meta: { aborted: true } }); + }; + + var handle = { + abort: abort, + pause: notImplemented, + resume: notImplemented + }; + if (isFunction(worker.userStep)) { for (var i = 0; i < msg.results.data.length; i++) @@ -1365,24 +1254,33 @@ data: [msg.results.data[i]], errors: msg.results.errors, meta: msg.results.meta - }); + }, handle); + if (aborted) + break; } delete msg.results; // free memory ASAP } else if (isFunction(worker.userChunk)) { - worker.userChunk(msg.results, msg.file); + worker.userChunk(msg.results, handle, msg.file); delete msg.results; } } - if (msg.finished) - { - if (isFunction(workers[msg.workerId].userComplete)) - workers[msg.workerId].userComplete(msg.results); - workers[msg.workerId].terminate(); - delete workers[msg.workerId]; - } + if (msg.finished && !aborted) + completeWorker(msg.workerId, msg.results); + } + + function completeWorker(workerId, results) { + var worker = workers[workerId]; + if (isFunction(worker.userComplete)) + worker.userComplete(results); + worker.terminate(); + delete workers[workerId]; + } + + function notImplemented() { + throw "Not implemented."; } // Callback when worker thread receives a message @@ -1413,60 +1311,7 @@ } } - // Replaces bad config values with good, default ones - function copyAndValidateConfig(origConfig) - { - if (typeof origConfig !== 'object') - origConfig = {}; - - var config = copy(origConfig); - - if (typeof config.delimiter !== 'string' - || config.delimiter.length != 1 - || Papa.BAD_DELIMITERS.indexOf(config.delimiter) > -1) - 
config.delimiter = DEFAULTS.delimiter; - - if (config.newline != '\n' - && config.newline != '\r' - && config.newline != '\r\n') - config.newline = DEFAULTS.newline; - - if (typeof config.header !== 'boolean') - config.header = DEFAULTS.header; - - if (typeof config.dynamicTyping !== 'boolean') - config.dynamicTyping = DEFAULTS.dynamicTyping; - - if (typeof config.preview !== 'number') - config.preview = DEFAULTS.preview; - - if (typeof config.step !== 'function') - config.step = DEFAULTS.step; - - if (typeof config.complete !== 'function') - config.complete = DEFAULTS.complete; - - if (typeof config.error !== 'function') - config.error = DEFAULTS.error; - - if (typeof config.encoding !== 'string') - config.encoding = DEFAULTS.encoding; - - if (typeof config.worker !== 'boolean') - config.worker = DEFAULTS.worker; - - if (typeof config.download !== 'boolean') - config.download = DEFAULTS.download; - - if (typeof config.skipEmptyLines !== 'boolean') - config.skipEmptyLines = DEFAULTS.skipEmptyLines; - - if (typeof config.fastMode !== 'boolean') - config.fastMode = DEFAULTS.fastMode; - - return config; - } - + // Makes a deep copy of an array or object (mostly) function copy(obj) { if (typeof obj !== 'object') @@ -1477,6 +1322,13 @@ return cpy; } + function bindFunction(f, self) + { + return function() { + f.apply(self, arguments); + } + } + function isFunction(func) { return typeof func === 'function';