# Review notes (preamble text placed before the first "diff --git" header).
#
# Purpose: adds an `eagerTextCapture` tokenizer option and passes the parser
# options to the `onparserinit` callback.
#
# lib/Parser.js
#   - `_options` is now assigned after `endIndex`, and `onparserinit` is invoked
#     immediately afterwards as `onparserinit(this, this._options)`.
#   - The `onparserinit` calls at the end of the constructor and in `reset()`
#     are removed.
#   - NOTE(review): the callback now runs before `_lowerCaseTagNames` and
#     `_tokenizer` are assigned in the constructor -- confirm that no handler
#     reads those fields from inside `onparserinit`.
#
# lib/Tokenizer.js
#   - `_captureText()` returns early when `_index <= _sectionStart`; otherwise
#     it emits the current section through `ontext`, or appends it to
#     `_content` when `_eagerTextCapture` is set.
#   - `_flushText()` emits `_content` through `ontext` and clears it; it is
#     called before `onopentagname`, before `onclosetag`, and from
#     `_handleTrailingData()`.
#   - `_emitPartial()` appends to `_content` instead of calling `ontext` when
#     `_eagerTextCapture` is set and the base state is TEXT.
#   - NOTE(review): the hunks shown here add no `_flushText()` call before the
#     comment / CDATA / processing-instruction callbacks -- verify that the
#     event order is still as intended in eager mode.
#   - NOTE(review): in the unchanged context of `_cleanup`, `_index` is set to 0
#     before `_bufferOffset += this._index`, so that addition adds 0 -- confirm
#     whether this is intended.
#
# lib/index.js
#   - Adds `parserinit: 2` beside the existing event entries (the number is
#     presumably the argument count -- TODO confirm).
#
# test/02-stream.js
#   - Adds a second `cb(err, events)` call inside the event collector callback.
#
# test/test-helper.js
#   - `writeToParser` adds a third pass with `options.eagerTextCapture = true`;
#     this assigns to the (defaulted) `options` object that was passed in.
#   - `getCallback` now calls `done()` on the third invocation.
#   - `eventReducer(toReduce, eagerTextCapture)` builds the reduced event list
#     and merges adjacent text events only when `eagerTextCapture` is falsy;
#     `onend` passes `this._parserOptions.eagerTextCapture` as that flag.
#   - `getCallback` logs `e.actual` with `console.log` before rethrowing.
#
diff --git a/lib/Parser.js b/lib/Parser.js index 3db01b4fc..1a37888c2 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -93,7 +93,7 @@ var voidElements = { var re_nameEnd = /\s|\//; function Parser(cbs, options){ - this._options = options || {}; + this._cbs = cbs || {}; this._tagname = ""; @@ -105,6 +105,9 @@ function Parser(cbs, options){ this.startIndex = 0; this.endIndex = null; + this._options = options || {}; + if(this._cbs.onparserinit) this._cbs.onparserinit(this, this._options); + this._lowerCaseTagNames = "lowerCaseTags" in this._options ? !!this._options.lowerCaseTags : !this._options.xmlMode; @@ -114,7 +117,6 @@ function Parser(cbs, options){ this._tokenizer = new Tokenizer(this._options, this); - if(this._cbs.onparserinit) this._cbs.onparserinit(this); } require("util").inherits(Parser, require("events").EventEmitter); @@ -317,8 +319,6 @@ Parser.prototype.reset = function(){ this._attribname = ""; this._attribs = null; this._stack = []; - - if(this._cbs.onparserinit) this._cbs.onparserinit(this); }; //Parses a complete HTML document and pushes it to the handler diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index ec013c127..5f0152b90 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -134,6 +134,7 @@ function consumeSpecialNameChar(upper, NEXT_STATE){ function Tokenizer(options, cbs){ this._state = TEXT; this._buffer = ""; + this._content = ""; this._sectionStart = 0; this._index = 0; this._bufferOffset = 0; //chars removed from _buffer @@ -144,19 +145,18 @@ function Tokenizer(options, cbs){ this._ended = false; this._xmlMode = !!(options && options.xmlMode); this._decodeEntities = !!(options && options.decodeEntities); + this._eagerTextCapture = !!(options && options.eagerTextCapture); + + } Tokenizer.prototype._stateText = function(c){ if(c === "<"){ - if(this._index > this._sectionStart){ - this._cbs.ontext(this._getSection()); - } + this._captureText(); this._state = BEFORE_TAG_NAME; this._sectionStart = this._index; } else 
if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){ - if(this._index > this._sectionStart){ - this._cbs.ontext(this._getSection()); - } + this._captureText(); this._baseState = TEXT; this._state = BEFORE_ENTITY; this._sectionStart = this._index; @@ -175,7 +175,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){ this._state = IN_PROCESSING_INSTRUCTION; this._sectionStart = this._index + 1; } else if(c === "<"){ - this._cbs.ontext(this._getSection()); + this._captureText(); this._sectionStart = this._index; } else { this._state = (!this._xmlMode && (c === "s" || c === "S")) ? @@ -186,6 +186,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){ Tokenizer.prototype._stateInTagName = function(c){ if(c === "/" || c === ">" || whitespace(c)){ + this._flushText(); this._emitToken("onopentagname"); this._state = BEFORE_ATTRIBUTE_NAME; this._index--; @@ -211,6 +212,7 @@ Tokenizer.prototype._stateBeforeCloseingTagName = function(c){ Tokenizer.prototype._stateInCloseingTagName = function(c){ if(c === ">" || whitespace(c)){ + this._flushText(); this._emitToken("onclosetag"); this._state = AFTER_CLOSING_TAG_NAME; this._index--; @@ -602,9 +604,7 @@ Tokenizer.prototype._cleanup = function (){ this._bufferOffset += this._index; } else if(this._running){ if(this._state === TEXT){ - if(this._sectionStart !== this._index){ - this._cbs.ontext(this._buffer.substr(this._sectionStart)); - } + this._captureText(); this._buffer = ""; this._index = 0; this._bufferOffset += this._index; @@ -831,51 +831,49 @@ Tokenizer.prototype.end = function(chunk){ Tokenizer.prototype._finish = function(){ //if there is remaining data, emit it in a reasonable way - if(this._sectionStart < this._index){ - this._handleTrailingData(); - } + this._handleTrailingData(); this._cbs.onend(); }; Tokenizer.prototype._handleTrailingData = function(){ - var data = this._buffer.substr(this._sectionStart); - - if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state 
=== AFTER_CDATA_2){ - this._cbs.oncdata(data); - } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){ - this._cbs.oncomment(data); - } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){ - this._parseLegacyEntity(); - if(this._sectionStart < this._index){ + + if(this._sectionStart < this._index){ + var data = this._buffer.substr(this._sectionStart); + if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){ + this._cbs.oncdata(data); + } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){ + this._cbs.oncomment(data); + } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){ + this._parseLegacyEntity(); this._state = this._baseState; this._handleTrailingData(); - } - } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){ - this._decodeNumericEntity(2, 10); - if(this._sectionStart < this._index){ + } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){ + this._decodeNumericEntity(2, 10); this._state = this._baseState; this._handleTrailingData(); - } - } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){ - this._decodeNumericEntity(3, 16); - if(this._sectionStart < this._index){ + } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){ + this._decodeNumericEntity(3, 16); this._state = this._baseState; this._handleTrailingData(); + } else if( + this._state !== IN_TAG_NAME && + this._state !== BEFORE_ATTRIBUTE_NAME && + this._state !== BEFORE_ATTRIBUTE_VALUE && + this._state !== AFTER_ATTRIBUTE_NAME && + this._state !== IN_ATTRIBUTE_NAME && + this._state !== IN_ATTRIBUTE_VALUE_SQ && + this._state !== IN_ATTRIBUTE_VALUE_DQ && + this._state !== IN_ATTRIBUTE_VALUE_NQ && + this._state !== IN_CLOSING_TAG_NAME + ){ + this._captureText(); + this._flushText(); } - } else if( - this._state !== IN_TAG_NAME && - this._state !== BEFORE_ATTRIBUTE_NAME && - this._state !== 
BEFORE_ATTRIBUTE_VALUE && - this._state !== AFTER_ATTRIBUTE_NAME && - this._state !== IN_ATTRIBUTE_NAME && - this._state !== IN_ATTRIBUTE_VALUE_SQ && - this._state !== IN_ATTRIBUTE_VALUE_DQ && - this._state !== IN_ATTRIBUTE_VALUE_NQ && - this._state !== IN_CLOSING_TAG_NAME - ){ - this._cbs.ontext(data); + } else if(this._state === TEXT){ + this._flushText(); } + //else, ignore remaining data //TODO add a way to remove current tag }; @@ -901,6 +899,31 @@ Tokenizer.prototype._emitPartial = function(value){ if(this._baseState !== TEXT){ this._cbs.onattribdata(value); //TODO implement the new event } else { - this._cbs.ontext(value); + if(this._eagerTextCapture) { + this._content += value; + } else { + this._cbs.ontext(value); + } + } +}; + +Tokenizer.prototype._captureText = function(){ + if(this._index <= this._sectionStart){ + return; + } + var text = this._getSection(); + if(!this._eagerTextCapture){ + this._cbs.ontext(text); + } else { + this._content += text; } }; + +Tokenizer.prototype._flushText = function(){ + if(!this._content){ + return; + } + this._cbs.ontext(this._content); + this._content = ""; +}; + diff --git a/lib/index.js b/lib/index.js index 880f57e90..9301f3513 100644 --- a/lib/index.js +++ b/lib/index.js @@ -56,6 +56,7 @@ module.exports = { cdatastart: 0, cdataend: 0, text: 1, + parserinit: 2, processinginstruction: 2, comment: 1, commentend: 0, diff --git a/test/02-stream.js b/test/02-stream.js index 340398067..7d15a35dd 100644 --- a/test/02-stream.js +++ b/test/02-stream.js @@ -9,6 +9,7 @@ helper.mochaTest("Stream", __dirname, function(test, cb){ new Stream( helper.getEventCollector(function(err, events){ cb(err, events); + cb(err, events); var handler = helper.getEventCollector(cb), stream = new Stream(handler, test.options); diff --git a/test/test-helper.js b/test/test-helper.js index 90a9907c7..f8d0a8f5d 100644 --- a/test/test-helper.js +++ b/test/test-helper.js @@ -6,43 +6,64 @@ var htmlparser2 = require(".."), CollectingHandler = 
htmlparser2.CollectingHandler; exports.writeToParser = function(handler, options, data){ - var parser = new Parser(handler, options); + options = options || {}; + var i, parser = new Parser(handler, options); + //first, try to run the test via chunks - for(var i = 0; i < data.length; i++){ + for(i = 0; i < data.length; i++){ parser.write(data.charAt(i)); } parser.end(); + //then parse everything parser.parseComplete(data); + parser.reset(); + + //and once again using the `'eagerTextCapture'` option + options.eagerTextCapture = true; + parser = new Parser(handler, options); + for(i = 0; i < data.length; i++){ + parser.write(data.charAt(i)); + } + parser.end(); }; //returns a tree structure exports.getEventCollector = function(cb){ - var handler = new CollectingHandler({onerror: cb, onend: onend}); + var handler = new CollectingHandler({onparserinit: init, onerror: cb, onend: onend}); return handler; + function init(parser, parserOptions){ + this._parser = parser; + this._parserOptions = parserOptions; + } + function onend(){ - cb(null, handler.events.reduce(eventReducer, [])); + cb(null, eventReducer(handler.events, this._parserOptions.eagerTextCapture)); } }; -function eventReducer(events, arr){ - if(arr[0] === "onerror" || arr[0] === "onend"); - else if(arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){ - events[events.length - 1].data[0] += arr[1]; - } else { - events.push({ - event: arr[0].substr(2), - data: arr.slice(1) - }); - } +function eventReducer(toReduce, eagerTextCapture){ + var events = []; + toReduce.forEach(function(arr){ + if(arr[0] === "onparserinit" || arr[0] === "onerror" || arr[0] === "onend"){ + return; + } else if(!eagerTextCapture && arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){ + events[events.length - 1].data[0] += arr[1]; + } else { + events.push({ + event: arr[0].substr(2), + data: arr.slice(1) + }); + } + }); return events; } function 
getCallback(expected, done){ - var repeated = false; + var repeated = 0; return function(err, actual){ assert.ifError(err); @@ -51,11 +72,12 @@ function getCallback(expected, done){ } catch(e){ e.expected = JSON.stringify(expected, null, 2); e.actual = JSON.stringify(actual, null, 2); + console.log(e.actual); throw e; } - if(repeated) done(); - else repeated = true; + if(repeated === 2) done(); + else repeated++; }; }