Adds an eagerTextCapture option.

jails · jails · commit 25d75e733dd9 · 2015-04-05T21:35:24.000+02:00
diff --git a/lib/Parser.js b/lib/Parser.js
@@ -93,7 +93,7 @@ var voidElements = {
 var re_nameEnd = /\s|\//;
 
 function Parser(cbs, options){
-	this._options = options || {};
+
 	this._cbs = cbs || {};
 
 	this._tagname = "";
@@ -105,6 +105,9 @@ function Parser(cbs, options){
 	this.startIndex = 0;
 	this.endIndex = null;
 
+	this._options = options || {};
+	if(this._cbs.onparserinit) this._cbs.onparserinit(this, this._options);
+
 	this._lowerCaseTagNames = "lowerCaseTags" in this._options ?
 									!!this._options.lowerCaseTags :
 									!this._options.xmlMode;
@@ -114,7 +117,6 @@ function Parser(cbs, options){
 
 	this._tokenizer = new Tokenizer(this._options, this);
 
-	if(this._cbs.onparserinit) this._cbs.onparserinit(this);
 }
 
 require("util").inherits(Parser, require("events").EventEmitter);
@@ -317,8 +319,6 @@ Parser.prototype.reset = function(){
 	this._attribname = "";
 	this._attribs = null;
 	this._stack = [];
-
-	if(this._cbs.onparserinit) this._cbs.onparserinit(this);
 };
 
 //Parses a complete HTML document and pushes it to the handler
diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js
@@ -134,6 +134,7 @@ function consumeSpecialNameChar(upper, NEXT_STATE){
 function Tokenizer(options, cbs){
 	this._state = TEXT;
 	this._buffer = "";
+	this._content = "";
 	this._sectionStart = 0;
 	this._index = 0;
 	this._bufferOffset = 0; //chars removed from _buffer
@@ -144,19 +145,18 @@ function Tokenizer(options, cbs){
 	this._ended = false;
 	this._xmlMode = !!(options && options.xmlMode);
 	this._decodeEntities = !!(options && options.decodeEntities);
+	this._eagerTextCapture = !!(options && options.eagerTextCapture);
+
+
 }
 
 Tokenizer.prototype._stateText = function(c){
 	if(c === "<"){
-		if(this._index > this._sectionStart){
-			this._cbs.ontext(this._getSection());
-		}
+		this._captureText();
 		this._state = BEFORE_TAG_NAME;
 		this._sectionStart = this._index;
 	} else if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){
-		if(this._index > this._sectionStart){
-			this._cbs.ontext(this._getSection());
-		}
+		this._captureText();
 		this._baseState = TEXT;
 		this._state = BEFORE_ENTITY;
 		this._sectionStart = this._index;
@@ -175,7 +175,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){
 		this._state = IN_PROCESSING_INSTRUCTION;
 		this._sectionStart = this._index + 1;
 	} else if(c === "<"){
-		this._cbs.ontext(this._getSection());
+		this._captureText();
 		this._sectionStart = this._index;
 	} else {
 		this._state = (!this._xmlMode && (c === "s" || c === "S")) ?
@@ -186,6 +186,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){
 
 Tokenizer.prototype._stateInTagName = function(c){
 	if(c === "/" || c === ">" || whitespace(c)){
+		this._flushText();
 		this._emitToken("onopentagname");
 		this._state = BEFORE_ATTRIBUTE_NAME;
 		this._index--;
@@ -211,6 +212,7 @@ Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
 
 Tokenizer.prototype._stateInCloseingTagName = function(c){
 	if(c === ">" || whitespace(c)){
+		this._flushText();
 		this._emitToken("onclosetag");
 		this._state = AFTER_CLOSING_TAG_NAME;
 		this._index--;
@@ -602,9 +604,7 @@ Tokenizer.prototype._cleanup = function (){
 		this._bufferOffset += this._index;
 	} else if(this._running){
 		if(this._state === TEXT){
-			if(this._sectionStart !== this._index){
-				this._cbs.ontext(this._buffer.substr(this._sectionStart));
-			}
+			this._captureText();
 			this._buffer = "";
 			this._index = 0;
 			this._bufferOffset += this._index;
@@ -831,51 +831,49 @@ Tokenizer.prototype.end = function(chunk){
 
 Tokenizer.prototype._finish = function(){
 	//if there is remaining data, emit it in a reasonable way
-	if(this._sectionStart < this._index){
-		this._handleTrailingData();
-	}
 
+	this._handleTrailingData();
 	this._cbs.onend();
 };
 
 Tokenizer.prototype._handleTrailingData = function(){
-	var data = this._buffer.substr(this._sectionStart);
-
-	if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
-		this._cbs.oncdata(data);
-	} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
-		this._cbs.oncomment(data);
-	} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
-		this._parseLegacyEntity();
-		if(this._sectionStart < this._index){
+
+	if(this._sectionStart < this._index){
+		var data = this._buffer.substr(this._sectionStart);
+		if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
+			this._cbs.oncdata(data);
+		} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
+			this._cbs.oncomment(data);
+		} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
+			this._parseLegacyEntity();
 			this._state = this._baseState;
 			this._handleTrailingData();
-		}
-	} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
-		this._decodeNumericEntity(2, 10);
-		if(this._sectionStart < this._index){
+		} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
+			this._decodeNumericEntity(2, 10);
 			this._state = this._baseState;
 			this._handleTrailingData();
-		}
-	} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
-		this._decodeNumericEntity(3, 16);
-		if(this._sectionStart < this._index){
+		} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
+			this._decodeNumericEntity(3, 16);
 			this._state = this._baseState;
 			this._handleTrailingData();
+		} else if(
+			this._state !== IN_TAG_NAME &&
+			this._state !== BEFORE_ATTRIBUTE_NAME &&
+			this._state !== BEFORE_ATTRIBUTE_VALUE &&
+			this._state !== AFTER_ATTRIBUTE_NAME &&
+			this._state !== IN_ATTRIBUTE_NAME &&
+			this._state !== IN_ATTRIBUTE_VALUE_SQ &&
+			this._state !== IN_ATTRIBUTE_VALUE_DQ &&
+			this._state !== IN_ATTRIBUTE_VALUE_NQ &&
+			this._state !== IN_CLOSING_TAG_NAME
+		){
+			this._captureText();
+			this._flushText();
 		}
-	} else if(
-		this._state !== IN_TAG_NAME &&
-		this._state !== BEFORE_ATTRIBUTE_NAME &&
-		this._state !== BEFORE_ATTRIBUTE_VALUE &&
-		this._state !== AFTER_ATTRIBUTE_NAME &&
-		this._state !== IN_ATTRIBUTE_NAME &&
-		this._state !== IN_ATTRIBUTE_VALUE_SQ &&
-		this._state !== IN_ATTRIBUTE_VALUE_DQ &&
-		this._state !== IN_ATTRIBUTE_VALUE_NQ &&
-		this._state !== IN_CLOSING_TAG_NAME
-	){
-		this._cbs.ontext(data);
 	}
+
+	//always emit text buffered by `eagerTextCapture`, even if the input ended inside a tag, comment or cdata section
+	this._flushText();
 	//else, ignore remaining data
 	//TODO add a way to remove current tag
 };
@@ -901,6 +899,31 @@ Tokenizer.prototype._emitPartial = function(value){
 	if(this._baseState !== TEXT){
 		this._cbs.onattribdata(value); //TODO implement the new event
 	} else {
-		this._cbs.ontext(value);
+		if(this._eagerTextCapture) {
+			this._content += value;
+		} else {
+			this._cbs.ontext(value);
+		}
+	}
+};
+
+Tokenizer.prototype._captureText = function(){
+	if(this._index <= this._sectionStart){
+		return;
+	}
+	var text = this._getSection();
+	if(!this._eagerTextCapture){
+		this._cbs.ontext(text);
+	} else {
+		this._content += text;
 	}
 };
+
+Tokenizer.prototype._flushText = function(){
+	if(!this._content){
+		return;
+	}
+	this._cbs.ontext(this._content);
+	this._content = "";
+};
+
diff --git a/lib/index.js b/lib/index.js
@@ -56,6 +56,7 @@ module.exports = {
 		cdatastart: 0,
 		cdataend: 0,
 		text: 1,
+		parserinit: 2,
 		processinginstruction: 2,
 		comment: 1,
 		commentend: 0,
diff --git a/test/02-stream.js b/test/02-stream.js
@@ -9,6 +9,7 @@ helper.mochaTest("Stream", __dirname, function(test, cb){
 		new Stream(
 			helper.getEventCollector(function(err, events){
 				cb(err, events);
+				cb(err, events);
 
 				var handler = helper.getEventCollector(cb),
 				    stream = new Stream(handler, test.options);
diff --git a/test/test-helper.js b/test/test-helper.js
@@ -6,43 +6,64 @@ var htmlparser2 = require(".."),
 	CollectingHandler = htmlparser2.CollectingHandler;
 
 exports.writeToParser = function(handler, options, data){
-	var parser = new Parser(handler, options);
+	options = options || {};
+	var i, parser = new Parser(handler, options);
+
 	//first, try to run the test via chunks
-	for(var i = 0; i < data.length; i++){
+	for(i = 0; i < data.length; i++){
 		parser.write(data.charAt(i));
 	}
 	parser.end();
+
 	//then parse everything
 	parser.parseComplete(data);
+	parser.reset();
+
+	//and once again using the `'eagerTextCapture'` option
+	options.eagerTextCapture = true;
+	parser = new Parser(handler, options);
+	for(i = 0; i < data.length; i++){
+		parser.write(data.charAt(i));
+	}
+	parser.end();
 };
 
 //returns a tree structure
 exports.getEventCollector = function(cb){
-	var handler = new CollectingHandler({onerror: cb, onend: onend});
+	var handler = new CollectingHandler({onparserinit: init, onerror: cb, onend: onend});
 
 	return handler;
 
+	function init(parser, parserOptions){
+		this._parser = parser;
+		this._parserOptions = parserOptions;
+	}
+
 	function onend(){
-		cb(null, handler.events.reduce(eventReducer, []));
+		cb(null, eventReducer(handler.events, this._parserOptions.eagerTextCapture), false);
 	}
 };
 
-function eventReducer(events, arr){
-	if(arr[0] === "onerror" || arr[0] === "onend");
-	else if(arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){
-		events[events.length - 1].data[0] += arr[1];
-	} else {
-		events.push({
-			event: arr[0].substr(2),
-			data: arr.slice(1)
-		});
-	}
+function eventReducer(toReduce, eagerTextCapture){ //eagerTextCapture: when truthy, consecutive text events are NOT merged
+	var events = [];
 
+	toReduce.forEach(function(arr){
+		if(arr[0] === "onparserinit" || arr[0] === "onerror" || arr[0] === "onend"){
+			return;
+		} else if(!eagerTextCapture && arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){
+			events[events.length - 1].data[0] += arr[1];
+		} else {
+			events.push({
+				event: arr[0].substr(2),
+				data: arr.slice(1)
+			});
+		}
+	});
 	return events;
 }
 
 function getCallback(expected, done){
-	var repeated = false;
+	var repeated = 0;
 
 	return function(err, actual){
 		assert.ifError(err);
@@ -51,11 +72,12 @@ function getCallback(expected, done){
 		} catch(e){
 			e.expected = JSON.stringify(expected, null, 2);
 			e.actual = JSON.stringify(actual, null, 2);
+			console.log(e.actual);
 			throw e;
 		}
 
-		if(repeated) done();
-		else repeated = true;
+		if(repeated === 2) done();
+		else repeated++;
 	};
 }