From 12fbcc6252c730fd4b1f6fca967f6a943bdcb801 Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Wed, 9 Oct 2013 08:15:44 -0700 Subject: [PATCH] Handling extra fields better; updated read me --- README.md | 228 ++++++++++++++++++++++++++++++++++++++------ jquery.parse.js | 7 +- jquery.parse.min.js | 4 +- parse.jquery.json | 2 +- 4 files changed, 208 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 732c763..abc10ab 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,102 @@ jquery.parse ============ -Robust, efficient CSV parsing (with nearly any delimiting character) +Robust, efficient CSV parsing (with nearly any delimiting character). Malformed CSV files are especially common, and this parser is an attempt to handle parsing errors more robustly and parse CSV text more efficiently. Basic usage ----------- +The second argument is optional, but here it is with the defaults: + ```javascript results = $.parse(csvString, { - delimiter: "\t", - header: true + delimiter: ",", + header: true, + dynamicTyping: true }); ``` -The default delimiter is `,` but can be set to anything anything except `"` or `\n`. +### Config options -By default, a header row is expected. The output and error handling depends on whether you include a header row with your data. +| Option | Description +|------------------ | ----------------- +| `delimiter` | The delimiting character. Usually just a comma or tab. Can be set to anything except `"` or `\n`. +| `header` | If true, interpret the first row of parsed data as a header row; fields are returned separately from the data, and data will be returned keyed to its field name. If false, the parser simply returns an array (list) of arrays (rows), including the first row. +| `dynamicTyping` | If true, fields that are strictly numeric will be converted to a number type. If false, each parsed datum is returned as a string. 
-**If `header: true`, the output looks like:** +### Output -```javascript +The output and error handling depend on whether you include a header row with your data. If you have a header, each row must have the same number of fields as the header row, or an error will be produced. + +**Example input:** + + Item,SKU,Cost,Quantity + Book,ABC1234,10.95,4 + Movie,DEF5678,29.99,3 + +**With header and dynamic typing:** + +```json { - errors: [ - // errors, if any (parsing should not throw exceptions) - ], - results: { - fields: [ - // field names from the header row - ], - rows: [ - // objects, where each field value is keyed to the field name - ] - } + "results": { + "fields": [ + "Item", + "SKU", + "Cost", + "Quantity" + ], + "rows": [ + { + "Item": "Book", + "SKU": "ABC1234", + "Cost": 10.95, + "Quantity": 4 + }, + { + "Item": "Movie", + "SKU": "DEF5678", + "Cost": 29.99, + "Quantity": 3 + } + ] + }, + "errors": [] } ``` +**Without headers and without dynamic typing:** -**If `header: false`, the output looks like:** - -```javascript +```json { - errors: [ - // errors, if any (parsing should not throw exceptions) - ], - results: [ - // each row is itself an array of values separated by delimiter - ] + "results": [ + [ + "Item", + "SKU", + "Cost", + "Quantity" + ], + [ + "Book", + "ABC1234", + "10.95", + "4" + ], + [ + "Movie", + "DEF5678", + "29.99", + "3" + ] + ], + "errors": [] } ``` -**Errors look like:** +Errors +------ + +Here is the structure of an error: ```javascript { @@ -59,4 +105,128 @@ By default, a header row is expected. The output and error handling depends on w row: 0, // Row index where error was index: 0 // Character index within original input } -``` \ No newline at end of file +``` + +(Assume again that the default config is used.) Suppose the input is malformed: + + Item,SKU,Cost,Quantity + Book,"ABC1234,10.95,4 + Movie,DEF5678,29.99,3 + +Notice the stray quotes on the second line. 
This is the output: + +```json +{ + "results": { + "fields": [ + "Item", + "SKU", + "Cost", + "Quantity" + ], + "rows": [ + { + "Item": "Book", + "SKU": "ABC1234,10.95,4\nMovie,DEF5678,29.99,3" + } + ] + }, + "errors": [ + { + "message": "Too few fields; expected 4 fields, parsed 2", + "line": 2, + "row": 0, + "index": 66 + }, + { + "message": "Unescaped or mismatched quotes", + "line": 2, + "row": 0, + "index": 66 + } + ] +} +``` + +If the header row is disabled, field counting does not occur, because there is no need to key the data to the field name: + +```json +{ + "results": [ + [ + "Item", + "SKU", + "Cost", + "Quantity" + ], + [ + "Book", + "ABC1234,10.95,4\nMovie,DEF5678,29.99,3" + ] + ], + "errors": [ + { + "message": "Unescaped or mismatched quotes", + "line": 2, + "row": 1, + "index": 66 + } + ] +} +``` + +But you will still be notified about the stray quotes, as shown above. + +Suppose a field value with a delimiter is not escaped: + + Item,SKU,Cost,Quantity + Book,ABC1234,10,95,4 + Movie,DEF5678,29.99,3 + +Again, notice the second line, "10,95" instead of "10.95". This field *should* be quoted: `"10,95"` but the parser handles the problem gracefully: + +```json +{ + "results": { + "fields": [ + "Item", + "SKU", + "Cost", + "Quantity" + ], + "rows": [ + { + "Item": "Book", + "SKU": "ABC1234", + "Cost": 10, + "Quantity": 95, + "__parsed_extra": [ + "4" + ] + }, + { + "Item": "Movie", + "SKU": "DEF5678", + "Cost": 29.99, + "Quantity": 3 + } + ] + }, + "errors": [ + { + "message": "Too many fields; expected 4 fields, found extra value: '4'", + "line": 2, + "row": 0, + "index": 43 + }, + { + "message": "Too few fields; expected 4 fields, parsed 5", + "line": 2, + "row": 0, + "index": 43 + } + ] +} +``` + +As you can see, any "extra" fields at the end, when using a header row, are simply tacked onto a special field named "__parsed_extra", in the order that the remaining line was parsed. 
\ No newline at end of file diff --git a/jquery.parse.js b/jquery.parse.js index 53f7f9b..a4c6ae6 100644 --- a/jquery.parse.js +++ b/jquery.parse.js @@ -1,6 +1,6 @@ /* jQuery Parse plugin - v0.5.1 + v0.5.2 https://github.com/mholt/jquery.parse */ @@ -225,7 +225,12 @@ currentRow[fieldName] = _state.fieldVal; } else + { + if (typeof currentRow.__parsed_extra === 'undefined') + currentRow.__parsed_extra = []; + currentRow.__parsed_extra.push(_state.fieldVal); addError("Too many fields; expected " + _state.parsed.fields.length + " fields, found extra value: '" + _state.fieldVal + "'"); + } } } else diff --git a/jquery.parse.min.js b/jquery.parse.min.js index 8f23089..7aef6a1 100644 --- a/jquery.parse.min.js +++ b/jquery.parse.min.js @@ -1,6 +1,6 @@ /* jQuery Parse plugin - v0.5.0 + v0.5.2 https://github.com/mholt/jquery.parse */ -;(function(e){function n(e){e.delimeter=e.delimiter||t.delimiter;e.header=typeof e.header==="undefined"?t.header:e.header;if(e.delimiter=='"'||e.delimiter=="\n")e.delimiter=t.delimiter;if(e.delimiter.length>1)e.delimiter=e.delimiter[0];return e}function r(e,t){function u(e){return e?{fields:[],rows:[]}:[[]]}function a(){return{i:0,line:1,field:0,fieldVal:"",ch:"",inQuotes:false,parsed:u(t.header)}}function f(){if(o.i0)o.parsed.rows.push({})}else o.parsed.push([]);o.line++;o.field=0}function v(){if(i.header){if(o.line==1){if(o.parsed.fields.length==1&&o.parsed.fields[0].length==0){o.parsed.fields=[];o.line--}}else{var e=o.parsed.rows[o.parsed.rows.length-1];if(!e[o.parsed.fields[0]])o.parsed.rows.splice(o.parsed.rows.length-1,1)}}else{var e=o.parsed[o.parsed.length-1];if(e.length==0||e[0].length==0)o.parsed.splice(o.parsed.length-1,1)}}function m(){if(!i.header)return true;if(o.parsed.rows.length==0)return true;var e=o.parsed.fields.length;var t=Object.keys(o.parsed.rows[o.parsed.rows.length-1]).length;if(e!=t)return g("Too few fields; expected "+e+" fields, parsed "+t);return true}function 
g(e){s.push({message:e,line:o.line,row:i.header?o.parsed.rows.length-1:o.parsed.length-1,index:o.i});return false}var n=this;var r=e;var i=t;var s=[];var o=a();this.parse=function(e){if(typeof e==="object")n.setConfig(e);else if(typeof e==="string")n.setInput(e);s=[];o=a();for(o.i=0;o.i1)e.delimiter=e.delimiter[0];return e}function r(e,t){function u(e){return e?{fields:[],rows:[]}:[[]]}function a(){return{i:0,line:1,field:0,fieldVal:"",ch:"",inQuotes:false,parsed:u(t.header)}}function f(){var e=o.i>0&&p(r[o.i-1]);var t=o.i0)o.parsed.rows.push({})}else o.parsed.push([]);o.line++;o.field=0}function m(e){var t=/^\d+(\.\d+)?$/.test(e);return t?parseFloat(e):e}function g(){if(i.header){if(o.line==1){if(o.parsed.fields.length==1&&o.parsed.fields[0].length==0){o.parsed.fields=[];o.line--}}else{var e=o.parsed.rows[o.parsed.rows.length-1];if(!e[o.parsed.fields[0]])o.parsed.rows.splice(o.parsed.rows.length-1,1)}}else{var e=o.parsed[o.parsed.length-1];if(e.length==0||e[0].length==0)o.parsed.splice(o.parsed.length-1,1)}}function y(){if(!i.header)return true;if(o.parsed.rows.length==0)return true;var e=o.parsed.fields.length;var t=Object.keys(o.parsed.rows[o.parsed.rows.length-1]).length;if(e!=t)return b("Too few fields; expected "+e+" fields, parsed "+t);return true}function b(e){s.push({message:e,line:o.line,row:i.header?o.parsed.rows.length-1:o.parsed.length-1,index:o.i});return false}var n=this;var r=e;var i=t;var s=[];var o=a();this.parse=function(e){if(typeof e==="object")n.setConfig(e);else if(typeof e==="string")n.setInput(e);s=[];o=a();for(o.i=0;o.i