From ee1091e88f2078bd5214bd74b5e839b89248e769 Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Tue, 17 Dec 2013 13:52:25 -0700 Subject: [PATCH] New feature to auto-detect the delimiter if none is specified (closes #11) --- README.md | 20 ++++--- index.html | 12 ++++- jquery.parse.js | 125 ++++++++++++++++++++++++++++++++++---------- jquery.parse.min.js | 4 +- parse.jquery.json | 5 +- tests.js | 72 ++++++++++++++++++++++++- 6 files changed, 195 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 8fecb60..075ee9a 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,11 @@ The jQuery Parse plugin is a robust and efficient CSV (character-separated value - Parses delimited text strings without any fuss - Attach to `<input type="file">` elements to load and parse files from disk +- Automatically detects delimiter (or specify a delimiter yourself) - Header row support - Gracefully handles malformed data - Optional dynamic typing so that numeric data is parsed as numbers - Descriptive and contextual errors -- Custom delimiter @@ -36,11 +36,12 @@ For debug/dev: [jquery.parse.js](https://github.com/mholt/jquery.parse/blob/mast Any time you invoke the parser, you may customize it using a "config" object. It supports these properties: -| Option | Default | Description -|-------------------- | -------- | --------------- -| **`delimiter`** | `","` | The delimiting character. Must be a string with length 1. Can be any character except `\n` and `"`. -| **`header`** | `true` | If true, interpret the first row of parsed data as column titles; fields are returned separately from the data, and data will be returned keyed to its field name. Duplicate field names would be problematic. If false, the parser simply returns an array (list) of arrays (rows), including the first row. -| **`dynamicTyping`** | `true` | If true, fields that are only numeric will be converted to a number type. If false, each parsed datum is returned as a string. 
+| Option | Default | Description +|-------------------- | ------- | --------------- +| **`delimiter`** | `""` | The delimiting character. Leave blank to auto-detect. If you specify a delimiter, it must be a string of length 1, and cannot be `\n`, `\r`, or `"`. +| **`header`** | `true` | If true, interpret the first row of parsed data as column titles; fields are returned separately from the data, and data will be returned keyed to its field name. Duplicate field names would be problematic. If false, the parser simply returns an array (list) of arrays (rows), including the first row. +| **`dynamicTyping`** | `true` | If true, fields that are only numeric will be converted to a number type. If false, each parsed datum is returned as a string. +| **`preview`** | `0` | If preview > 0, only that many rows will be parsed. @@ -61,7 +62,8 @@ Or to customize the settings, pass in a config object with any properties you wi var results = $.parse(csvString, { delimiter: "\t", header: false, - dynamicTyping: false + dynamicTyping: false, + preview: 10 }); ``` @@ -155,6 +157,8 @@ The results will always have this basic structure: } ``` +If no delimiter is specified and a delimiter cannot be auto-detected, an error keyed by "config" will be produced, and a default delimiter will be chosen. + **Example input:** Item,SKU,Cost,Quantity @@ -399,7 +403,7 @@ The Parser component is under test. Download this repository and open `tests.htm The Parser function ------------------- -Inside this jQuery plugin is a `Parser` function that actually performs the parsing of delimited text. It does not depend upon jQuery. This plugin uses jQuery to attach to `<input type="file">` elements and to make it more convenient to activate the parsing mechanism. +Inside this jQuery plugin is a `Parser` function that performs the parsing of delimited text. It does not depend upon jQuery. This plugin uses jQuery to attach to `<input type="file">` elements and to make it more convenient to activate and use the parsing mechanism. 
diff --git a/index.html b/index.html index 43445ca..842f232 100644 --- a/index.html +++ b/index.html @@ -28,6 +28,10 @@ width: 80px; } + #tabdelim { + font-size: 12px; + } + .container { width: 100%; } @@ -53,7 +57,8 @@
- Delimiter: + Delimiter: (Tab) +         @@ -118,6 +123,11 @@ $(function() }); }); + $('#tabdelim').click(function() + { + $('#delim').val("\t"); + }) + function userConfig() { return { diff --git a/jquery.parse.js b/jquery.parse.js index bb2a03c..6135a0f 100644 --- a/jquery.parse.js +++ b/jquery.parse.js @@ -1,6 +1,6 @@ /* jQuery Parse Plugin - v1.0.1 + v1.1.0 https://github.com/mholt/jquery.parse */ @@ -112,6 +112,8 @@ // Parser is the actual parsing component. // It is under test and does not depend on jQuery. + // You could rip this entire function out of the plugin + // and use it independently (with attribution). function Parser(config) { var self = this; @@ -119,34 +121,16 @@ var _config = {}; var _state = emptyState(); var _defaultConfig = { - delimiter: ",", + delimiter: "", header: true, - dynamicTyping: true + dynamicTyping: true, + preview: 0 }; var _regex = { floats: /^\s*-?(\d*\.?\d+|\d+\.?\d*)(e[-+]?\d+)?\s*$/i, empty: /^\s*$/ }; - this.setOptions = function(opt) - { - opt = validConfig(opt); - _config = { - delimiter: opt.delimiter, - header: opt.header, - dynamicTyping: opt.dynamicTyping - }; - }; - - this.getOptions = function() - { - return { - delimiter: _config.delimiter, - header: _config.header, - dynamicTyping: _config.dynamicTyping - }; - }; - this.parse = function(input) { if (typeof input !== 'string') @@ -154,8 +138,17 @@ reset(input); + if (!_config.delimiter && !guessDelimiter(input)) + { + addError("Delimiter", "UndetectableDelimiter", "Unable to auto-detect delimiting character; defaulted to comma", "config"); + _config.delimiter = ","; + } + for (_state.i = 0; _state.i < _input.length; _state.i++) { + if (_config.preview > 0 && _state.row >= _config.preview) + break; + _state.ch = _input[_state.i]; _state.line += _state.ch; @@ -175,6 +168,27 @@ return returnable(); }; + this.setOptions = function(opt) + { + opt = validConfig(opt); + _config = { + delimiter: opt.delimiter, + header: opt.header, + dynamicTyping: opt.dynamicTyping, + 
preview: opt.preview + }; + }; + + this.getOptions = function() + { + return { + delimiter: _config.delimiter, + header: _config.header, + dynamicTyping: _config.dynamicTyping, + preview: _config.preview + }; + }; + this.setOptions(config); function validConfig(config) @@ -183,8 +197,8 @@ || config.delimiter.length != 1) config.delimiter = _defaultConfig.delimiter; - if (config.delimiter == '"' || config.delimiter == "\n") - config.delimiter = _defaultConfig.delimiter; + if (config.delimiter == '"' || config.delimiter == "\n") + config.delimiter = _defaultConfig.delimiter; if (typeof config.header !== 'boolean') config.header = _defaultConfig.header; @@ -192,9 +206,61 @@ if (typeof config.dynamicTyping !== 'boolean') config.dynamicTyping = _defaultConfig.dynamicTyping; + if (typeof config.preview !== 'number') + config.preview = _defaultConfig.preview; + return config; } + function guessDelimiter(input) + { + var delimiters = [",", "\t", "|", ";"]; + var bestDelim, bestDelta, fieldCountPrevRow; + + for (var i in delimiters) + { + var delim = delimiters[i]; + var delta = 0, avgFieldCount = 0; + + var preview = new Parser({ + delimiter: delim, + header: false, + dynamicTyping: false, + preview: 10 + }).parse(input); + + for (var j in preview.results) + { + var fieldCount = preview.results[j].length; + avgFieldCount += fieldCount; + + if (typeof fieldCountPrevRow === 'undefined') + { + fieldCountPrevRow = fieldCount; + continue; + } + else if (fieldCount > 1) + { + delta += Math.abs(fieldCount - fieldCountPrevRow); + fieldCountPrevRow = fieldCount; + } + } + + avgFieldCount /= preview.results.length; + + if ((typeof bestDelta === 'undefined' || delta < bestDelta) + && avgFieldCount > 1.99) + { + bestDelta = delta; + bestDelim = delim; + } + } + + _config.delimiter = bestDelim; + + return !!bestDelim; + } + function emptyState() { return { @@ -393,16 +459,17 @@ return true; } - function addError(type, code, msg) + function addError(type, code, msg, errKey) { var row = 
_config.header - ? _state.parsed.rows.length - 1 + ? (_state.parsed.rows.length ? _state.parsed.rows.length - 1 : undefined) : _state.parsed.length - 1; + var key = errKey || row; - if (typeof _state.errors[row] === 'undefined') - _state.errors[row] = []; + if (typeof _state.errors[key] === 'undefined') + _state.errors[key] = []; - _state.errors[row].push({ + _state.errors[key].push({ type: type, code: code, message: msg, diff --git a/jquery.parse.min.js b/jquery.parse.min.js index a2daa16..40abc81 100644 --- a/jquery.parse.min.js +++ b/jquery.parse.min.js @@ -1,6 +1,6 @@ /* jQuery Parse Plugin - v1.0.1 + v1.1.0 https://github.com/mholt/jquery.parse */ -;(function(e){"use strict";function t(e){return typeof e==="function"}function n(e){return typeof e!=="undefined"}function r(e){function u(e){if(typeof e.delimiter!=="string"||e.delimiter.length!=1)e.delimiter=s.delimiter;if(e.delimiter=='"'||e.delimiter=="\n")e.delimiter=s.delimiter;if(typeof e.header!=="boolean")e.header=s.header;if(typeof e.dynamicTyping!=="boolean")e.dynamicTyping=s.dynamicTyping;return e}function a(){return{i:0,lineNum:1,field:0,fieldVal:"",line:"",ch:"",inQuotes:false,parsed:r.header?{fields:[],rows:[]}:[[]],errors:{length:0}}}function f(){var e=i.i>0&&p(i.i-1)||i.i==0;var t=i.i=n.length)return false;var t=n[e];if(t==r.delimiter||t=="\n"||t=="\r"&&e=n.length)return false;if(e0)i.parsed.rows.push({});else i.parsed.push([]);i.lineNum++;i.line="";i.field=0}function g(){v();var e=b();if(!e&&r.header)w()}function y(e){var t=o.floats.test(e);return t?parseFloat(e):e}function b(){if(o.empty.test(i.line)){if(r.header){if(i.lineNum==1){i.parsed.fields=[];i.lineNum--}else i.parsed.rows.splice(i.parsed.rows.length-1,1)}else i.parsed.splice(i.parsed.length-1,1);return true}return false}function w(){if(!r.header)return true;if(i.parsed.rows.length==0)return true;var e=i.parsed.fields.length;var t=0;var n=i.parsed.rows[i.parsed.rows.length-1];for(var s in n)if(n.hasOwnProperty(s))t++;if(te)return 
E("FieldMismatch","TooManyFields","Too many fields: expected "+e+" fields but parsed "+t);return true}function E(e,t,n){var s=r.header?i.parsed.rows.length-1:i.parsed.length-1;if(typeof i.errors[s]==="undefined")i.errors[s]=[];i.errors[s].push({type:e,code:t,message:n,line:i.lineNum,row:s,index:i.i});i.errors.length++;return false}function S(){return{results:i.parsed,errors:i.errors}}function x(e){i=a();n=e}var t=this;var n="";var r={};var i=a();var s={delimiter:",",header:true,dynamicTyping:true};var o={floats:/^\s*-?(\d*\.?\d+|\d+\.?\d*)(e[-+]?\d+)?\s*$/i,empty:/^\s*$/};this.setOptions=function(e){e=u(e);r={delimiter:e.delimiter,header:e.header,dynamicTyping:e.dynamicTyping}};this.getOptions=function(){return{delimiter:r.delimiter,header:r.header,dynamicTyping:r.dynamicTyping}};this.parse=function(e){if(typeof e!=="string")return S();x(e);for(i.i=0;i.i1){f+=Math.abs(p-o);o=p}}l/=c.results.length;if((typeof s==="undefined"||f1.99){s=f;n=a}}i.delimiter=n;return!!n}function l(){return{i:0,lineNum:1,field:0,fieldVal:"",line:"",ch:"",inQuotes:false,parsed:i.header?{fields:[],rows:[]}:[[]],errors:{length:0}}}function c(){var e=s.i>0&&v(s.i-1)||s.i==0;var t=s.i=n.length)return false;var t=n[e];if(t==i.delimiter||t=="\n"||t=="\r"&&e=n.length)return false;if(e0)s.parsed.rows.push({});else s.parsed.push([]);s.lineNum++;s.line="";s.field=0}function b(){g();var e=E();if(!e&&i.header)S()}function w(e){var t=u.floats.test(e);return t?parseFloat(e):e}function E(){if(u.empty.test(s.line)){if(i.header){if(s.lineNum==1){s.parsed.fields=[];s.lineNum--}else s.parsed.rows.splice(s.parsed.rows.length-1,1)}else s.parsed.splice(s.parsed.length-1,1);return true}return false}function S(){if(!i.header)return true;if(s.parsed.rows.length==0)return true;var e=s.parsed.fields.length;var t=0;var n=s.parsed.rows[s.parsed.rows.length-1];for(var r in n)if(n.hasOwnProperty(r))t++;if(te)return x("FieldMismatch","TooManyFields","Too many fields: expected "+e+" fields but parsed "+t);return 
true}function x(e,t,n,r){var o=i.header?s.parsed.rows.length?s.parsed.rows.length-1:undefined:s.parsed.length-1;var u=r||o;if(typeof s.errors[u]==="undefined")s.errors[u]=[];s.errors[u].push({type:e,code:t,message:n,line:s.lineNum,row:o,index:s.i});s.errors.length++;return false}function T(){return{results:s.parsed,errors:s.errors}}function N(e){s=l();n=e}var t=this;var n="";var i={};var s=l();var o={delimiter:"",header:true,dynamicTyping:true,preview:0};var u={floats:/^\s*-?(\d*\.?\d+|\d+\.?\d*)(e[-+]?\d+)?\s*$/i,empty:/^\s*$/};this.parse=function(e){if(typeof e!=="string")return T();N(e);if(!i.delimiter&&!f(e)){x("Delimiter","UndetectableDelimiter","Unable to auto-detect delimiting character; defaulted to comma","config");i.delimiter=","}for(s.i=0;s.i0&&s.row>=i.preview)break;s.ch=n[s.i];s.line+=s.ch;if(s.ch=='"')c();else if(s.inQuotes)h();else d()}b();if(s.inQuotes)x("Quotes","MissingQuotes","Unescaped or mismatched quotes");return T()};this.setOptions=function(e){e=a(e);i={delimiter:e.delimiter,header:e.header,dynamicTyping:e.dynamicTyping,preview:e.preview}};this.getOptions=function(){return{delimiter:i.delimiter,header:i.header,dynamicTyping:i.dynamicTyping,preview:i.preview}};this.setOptions(e)}e.fn.parse=function(r){function i(e,n,i){if(t(r.error))r.error({name:e},n,i)}var s=n(r.config)?r.config:{};this.each(function(o){var u=e(this).prop("tagName").toUpperCase()=="INPUT"&&e(this).attr("type")=="file"&&window.FileReader;if(!u)return true;var a={delimiter:s.delimiter,header:s.header,dynamicTyping:s.dynamicTyping};if(!this.files||this.files.length==0){i("NoFileError",undefined,this);return true}for(var f=0;f