Skip to content

Adds an eagerTextCapture option. #124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions lib/Parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ var voidElements = {
var re_nameEnd = /\s|\//;

function Parser(cbs, options){
this._options = options || {};

this._cbs = cbs || {};

this._tagname = "";
Expand All @@ -105,6 +105,9 @@ function Parser(cbs, options){
this.startIndex = 0;
this.endIndex = null;

this._options = options || {};
if(this._cbs.onparserinit) this._cbs.onparserinit(this, this._options);

this._lowerCaseTagNames = "lowerCaseTags" in this._options ?
!!this._options.lowerCaseTags :
!this._options.xmlMode;
Expand All @@ -114,7 +117,6 @@ function Parser(cbs, options){

this._tokenizer = new Tokenizer(this._options, this);

if(this._cbs.onparserinit) this._cbs.onparserinit(this);
}

require("util").inherits(Parser, require("events").EventEmitter);
Expand Down Expand Up @@ -317,8 +319,6 @@ Parser.prototype.reset = function(){
this._attribname = "";
this._attribs = null;
this._stack = [];

if(this._cbs.onparserinit) this._cbs.onparserinit(this);
};

//Parses a complete HTML document and pushes it to the handler
Expand Down
109 changes: 66 additions & 43 deletions lib/Tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ function consumeSpecialNameChar(upper, NEXT_STATE){
function Tokenizer(options, cbs){
this._state = TEXT;
this._buffer = "";
this._content = "";
this._sectionStart = 0;
this._index = 0;
this._bufferOffset = 0; //chars removed from _buffer
Expand All @@ -144,19 +145,18 @@ function Tokenizer(options, cbs){
this._ended = false;
this._xmlMode = !!(options && options.xmlMode);
this._decodeEntities = !!(options && options.decodeEntities);
this._eagerTextCapture = !!(options && options.eagerTextCapture);


}

Tokenizer.prototype._stateText = function(c){
if(c === "<"){
if(this._index > this._sectionStart){
this._cbs.ontext(this._getSection());
}
this._captureText();
this._state = BEFORE_TAG_NAME;
this._sectionStart = this._index;
} else if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){
if(this._index > this._sectionStart){
this._cbs.ontext(this._getSection());
}
this._captureText();
this._baseState = TEXT;
this._state = BEFORE_ENTITY;
this._sectionStart = this._index;
Expand All @@ -175,7 +175,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){
this._state = IN_PROCESSING_INSTRUCTION;
this._sectionStart = this._index + 1;
} else if(c === "<"){
this._cbs.ontext(this._getSection());
this._captureText();
this._sectionStart = this._index;
} else {
this._state = (!this._xmlMode && (c === "s" || c === "S")) ?
Expand All @@ -186,6 +186,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){

Tokenizer.prototype._stateInTagName = function(c){
if(c === "/" || c === ">" || whitespace(c)){
this._flushText();
this._emitToken("onopentagname");
this._state = BEFORE_ATTRIBUTE_NAME;
this._index--;
Expand All @@ -211,6 +212,7 @@ Tokenizer.prototype._stateBeforeCloseingTagName = function(c){

Tokenizer.prototype._stateInCloseingTagName = function(c){
if(c === ">" || whitespace(c)){
this._flushText();
this._emitToken("onclosetag");
this._state = AFTER_CLOSING_TAG_NAME;
this._index--;
Expand Down Expand Up @@ -602,9 +604,7 @@ Tokenizer.prototype._cleanup = function (){
this._bufferOffset += this._index;
} else if(this._running){
if(this._state === TEXT){
if(this._sectionStart !== this._index){
this._cbs.ontext(this._buffer.substr(this._sectionStart));
}
this._captureText();
this._buffer = "";
this._index = 0;
this._bufferOffset += this._index;
Expand Down Expand Up @@ -831,51 +831,49 @@ Tokenizer.prototype.end = function(chunk){

Tokenizer.prototype._finish = function(){
//if there is remaining data, emit it in a reasonable way
if(this._sectionStart < this._index){
this._handleTrailingData();
}

this._handleTrailingData();
this._cbs.onend();
};

Tokenizer.prototype._handleTrailingData = function(){
var data = this._buffer.substr(this._sectionStart);

if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
this._cbs.oncdata(data);
} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
this._cbs.oncomment(data);
} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
this._parseLegacyEntity();
if(this._sectionStart < this._index){

if(this._sectionStart < this._index){
var data = this._buffer.substr(this._sectionStart);
if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
this._cbs.oncdata(data);
} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
this._cbs.oncomment(data);
} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
this._parseLegacyEntity();
this._state = this._baseState;
this._handleTrailingData();
}
} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
this._decodeNumericEntity(2, 10);
if(this._sectionStart < this._index){
} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
this._decodeNumericEntity(2, 10);
this._state = this._baseState;
this._handleTrailingData();
}
} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
this._decodeNumericEntity(3, 16);
if(this._sectionStart < this._index){
} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
this._decodeNumericEntity(3, 16);
this._state = this._baseState;
this._handleTrailingData();
} else if(
this._state !== IN_TAG_NAME &&
this._state !== BEFORE_ATTRIBUTE_NAME &&
this._state !== BEFORE_ATTRIBUTE_VALUE &&
this._state !== AFTER_ATTRIBUTE_NAME &&
this._state !== IN_ATTRIBUTE_NAME &&
this._state !== IN_ATTRIBUTE_VALUE_SQ &&
this._state !== IN_ATTRIBUTE_VALUE_DQ &&
this._state !== IN_ATTRIBUTE_VALUE_NQ &&
this._state !== IN_CLOSING_TAG_NAME
){
this._captureText();
this._flushText();
}
} else if(
this._state !== IN_TAG_NAME &&
this._state !== BEFORE_ATTRIBUTE_NAME &&
this._state !== BEFORE_ATTRIBUTE_VALUE &&
this._state !== AFTER_ATTRIBUTE_NAME &&
this._state !== IN_ATTRIBUTE_NAME &&
this._state !== IN_ATTRIBUTE_VALUE_SQ &&
this._state !== IN_ATTRIBUTE_VALUE_DQ &&
this._state !== IN_ATTRIBUTE_VALUE_NQ &&
this._state !== IN_CLOSING_TAG_NAME
){
this._cbs.ontext(data);
} else if(this._state === TEXT){
this._flushText();
}

//else, ignore remaining data
//TODO add a way to remove current tag
};
Expand All @@ -901,6 +899,31 @@ Tokenizer.prototype._emitPartial = function(value){
if(this._baseState !== TEXT){
this._cbs.onattribdata(value); //TODO implement the new event
} else {
this._cbs.ontext(value);
if(this._eagerTextCapture) {
this._content += value;
} else {
this._cbs.ontext(value);
}
}
};

//Takes the current section (as returned by `_getSection`) and either appends
//it to `_content` (when `_eagerTextCapture` is set) or hands it to `ontext`.
//Does nothing when `_index` has not advanced past `_sectionStart`.
Tokenizer.prototype._captureText = function(){
	//empty section: nothing to capture
	if(this._sectionStart >= this._index) return;

	var section = this._getSection();

	if(this._eagerTextCapture){
		//buffer the text; `_flushText` emits it later
		this._content += section;
		return;
	}

	this._cbs.ontext(section);
};

//Emits whatever has been accumulated in `_content` through a single `ontext`
//call and then empties the buffer. A falsy (e.g. empty) buffer is a no-op.
Tokenizer.prototype._flushText = function(){
	if(this._content){
		//emit first, reset afterwards
		this._cbs.ontext(this._content);
		this._content = "";
	}
};

1 change: 1 addition & 0 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ module.exports = {
cdatastart: 0,
cdataend: 0,
text: 1,
parserinit: 2,
processinginstruction: 2,
comment: 1,
commentend: 0,
Expand Down
1 change: 1 addition & 0 deletions test/02-stream.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ helper.mochaTest("Stream", __dirname, function(test, cb){
new Stream(
helper.getEventCollector(function(err, events){
cb(err, events);
cb(err, events);

var handler = helper.getEventCollector(cb),
stream = new Stream(handler, test.options);
Expand Down
56 changes: 39 additions & 17 deletions test/test-helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,64 @@ var htmlparser2 = require(".."),
CollectingHandler = htmlparser2.CollectingHandler;

exports.writeToParser = function(handler, options, data){
var parser = new Parser(handler, options);
options = options || {};
var i, parser = new Parser(handler, options);

//first, try to run the test via chunks
for(var i = 0; i < data.length; i++){
for(i = 0; i < data.length; i++){
parser.write(data.charAt(i));
}
parser.end();

//then parse everything
parser.parseComplete(data);
parser.reset();

//and once again using the `'eagerTextCapture'` option
options.eagerTextCapture = true;
parser = new Parser(handler, options);
for(i = 0; i < data.length; i++){
parser.write(data.charAt(i));
}
parser.end();
};

//returns a tree structure
exports.getEventCollector = function(cb){
var handler = new CollectingHandler({onerror: cb, onend: onend});
var handler = new CollectingHandler({onparserinit: init, onerror: cb, onend: onend});

return handler;

function init(parser, parserOptions){
this._parser = parser;
this._parserOptions = parserOptions;
}

function onend(){
cb(null, handler.events.reduce(eventReducer, []));
cb(null, eventReducer(handler.events, this._parserOptions.eagerTextCapture), false);
}
};

function eventReducer(events, arr){
if(arr[0] === "onerror" || arr[0] === "onend");
else if(arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){
events[events.length - 1].data[0] += arr[1];
} else {
events.push({
event: arr[0].substr(2),
data: arr.slice(1)
});
}
function eventReducer(toReduce, arr, eagerTextCapture){
var events = [];

toReduce.forEach(function(arr){
if(arr[0] === "onparserinit" || arr[0] === "onerror" || arr[0] === "onend"){
return;
} else if(!eagerTextCapture && arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){
events[events.length - 1].data[0] += arr[1];
} else {
events.push({
event: arr[0].substr(2),
data: arr.slice(1)
});
}
});
return events;
}

function getCallback(expected, done){
var repeated = false;
var repeated = 0;

return function(err, actual){
assert.ifError(err);
Expand All @@ -51,11 +72,12 @@ function getCallback(expected, done){
} catch(e){
e.expected = JSON.stringify(expected, null, 2);
e.actual = JSON.stringify(actual, null, 2);
console.log(e.actual);
throw e;
}

if(repeated) done();
else repeated = true;
if(repeated === 2) done();
else repeated++;
};
}

Expand Down