Skip to content

Commit 25d75e7

Browse files
committed
Adds an eagerTextCapture option.
1 parent 9e770fc commit 25d75e7

File tree

5 files changed

+111
-64
lines changed

5 files changed

+111
-64
lines changed

lib/Parser.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ var voidElements = {
9393
var re_nameEnd = /\s|\//;
9494

9595
function Parser(cbs, options){
96-
this._options = options || {};
96+
9797
this._cbs = cbs || {};
9898

9999
this._tagname = "";
@@ -105,6 +105,9 @@ function Parser(cbs, options){
105105
this.startIndex = 0;
106106
this.endIndex = null;
107107

108+
this._options = options || {};
109+
if(this._cbs.onparserinit) this._cbs.onparserinit(this, this._options);
110+
108111
this._lowerCaseTagNames = "lowerCaseTags" in this._options ?
109112
!!this._options.lowerCaseTags :
110113
!this._options.xmlMode;
@@ -114,7 +117,6 @@ function Parser(cbs, options){
114117

115118
this._tokenizer = new Tokenizer(this._options, this);
116119

117-
if(this._cbs.onparserinit) this._cbs.onparserinit(this);
118120
}
119121

120122
require("util").inherits(Parser, require("events").EventEmitter);
@@ -317,8 +319,6 @@ Parser.prototype.reset = function(){
317319
this._attribname = "";
318320
this._attribs = null;
319321
this._stack = [];
320-
321-
if(this._cbs.onparserinit) this._cbs.onparserinit(this);
322322
};
323323

324324
//Parses a complete HTML document and pushes it to the handler

lib/Tokenizer.js

Lines changed: 66 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ function consumeSpecialNameChar(upper, NEXT_STATE){
134134
function Tokenizer(options, cbs){
135135
this._state = TEXT;
136136
this._buffer = "";
137+
this._content = "";
137138
this._sectionStart = 0;
138139
this._index = 0;
139140
this._bufferOffset = 0; //chars removed from _buffer
@@ -144,19 +145,18 @@ function Tokenizer(options, cbs){
144145
this._ended = false;
145146
this._xmlMode = !!(options && options.xmlMode);
146147
this._decodeEntities = !!(options && options.decodeEntities);
148+
this._eagerTextCapture = !!(options && options.eagerTextCapture);
149+
150+
147151
}
148152

149153
Tokenizer.prototype._stateText = function(c){
150154
if(c === "<"){
151-
if(this._index > this._sectionStart){
152-
this._cbs.ontext(this._getSection());
153-
}
155+
this._captureText();
154156
this._state = BEFORE_TAG_NAME;
155157
this._sectionStart = this._index;
156158
} else if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){
157-
if(this._index > this._sectionStart){
158-
this._cbs.ontext(this._getSection());
159-
}
159+
this._captureText();
160160
this._baseState = TEXT;
161161
this._state = BEFORE_ENTITY;
162162
this._sectionStart = this._index;
@@ -175,7 +175,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){
175175
this._state = IN_PROCESSING_INSTRUCTION;
176176
this._sectionStart = this._index + 1;
177177
} else if(c === "<"){
178-
this._cbs.ontext(this._getSection());
178+
this._captureText();
179179
this._sectionStart = this._index;
180180
} else {
181181
this._state = (!this._xmlMode && (c === "s" || c === "S")) ?
@@ -186,6 +186,7 @@ Tokenizer.prototype._stateBeforeTagName = function(c){
186186

187187
Tokenizer.prototype._stateInTagName = function(c){
188188
if(c === "/" || c === ">" || whitespace(c)){
189+
this._flushText();
189190
this._emitToken("onopentagname");
190191
this._state = BEFORE_ATTRIBUTE_NAME;
191192
this._index--;
@@ -211,6 +212,7 @@ Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
211212

212213
Tokenizer.prototype._stateInCloseingTagName = function(c){
213214
if(c === ">" || whitespace(c)){
215+
this._flushText();
214216
this._emitToken("onclosetag");
215217
this._state = AFTER_CLOSING_TAG_NAME;
216218
this._index--;
@@ -602,9 +604,7 @@ Tokenizer.prototype._cleanup = function (){
602604
this._bufferOffset += this._index;
603605
} else if(this._running){
604606
if(this._state === TEXT){
605-
if(this._sectionStart !== this._index){
606-
this._cbs.ontext(this._buffer.substr(this._sectionStart));
607-
}
607+
this._captureText();
608608
this._buffer = "";
609609
this._index = 0;
610610
this._bufferOffset += this._index;
@@ -831,51 +831,49 @@ Tokenizer.prototype.end = function(chunk){
831831

832832
Tokenizer.prototype._finish = function(){
833833
//if there is remaining data, emit it in a reasonable way
834-
if(this._sectionStart < this._index){
835-
this._handleTrailingData();
836-
}
837834

835+
this._handleTrailingData();
838836
this._cbs.onend();
839837
};
840838

841839
Tokenizer.prototype._handleTrailingData = function(){
842-
var data = this._buffer.substr(this._sectionStart);
843-
844-
if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
845-
this._cbs.oncdata(data);
846-
} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
847-
this._cbs.oncomment(data);
848-
} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
849-
this._parseLegacyEntity();
850-
if(this._sectionStart < this._index){
840+
841+
if(this._sectionStart < this._index){
842+
var data = this._buffer.substr(this._sectionStart);
843+
if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
844+
this._cbs.oncdata(data);
845+
} else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
846+
this._cbs.oncomment(data);
847+
} else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
848+
this._parseLegacyEntity();
851849
this._state = this._baseState;
852850
this._handleTrailingData();
853-
}
854-
} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
855-
this._decodeNumericEntity(2, 10);
856-
if(this._sectionStart < this._index){
851+
} else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
852+
this._decodeNumericEntity(2, 10);
857853
this._state = this._baseState;
858854
this._handleTrailingData();
859-
}
860-
} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
861-
this._decodeNumericEntity(3, 16);
862-
if(this._sectionStart < this._index){
855+
} else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
856+
this._decodeNumericEntity(3, 16);
863857
this._state = this._baseState;
864858
this._handleTrailingData();
859+
} else if(
860+
this._state !== IN_TAG_NAME &&
861+
this._state !== BEFORE_ATTRIBUTE_NAME &&
862+
this._state !== BEFORE_ATTRIBUTE_VALUE &&
863+
this._state !== AFTER_ATTRIBUTE_NAME &&
864+
this._state !== IN_ATTRIBUTE_NAME &&
865+
this._state !== IN_ATTRIBUTE_VALUE_SQ &&
866+
this._state !== IN_ATTRIBUTE_VALUE_DQ &&
867+
this._state !== IN_ATTRIBUTE_VALUE_NQ &&
868+
this._state !== IN_CLOSING_TAG_NAME
869+
){
870+
this._captureText();
871+
this._flushText();
865872
}
866-
} else if(
867-
this._state !== IN_TAG_NAME &&
868-
this._state !== BEFORE_ATTRIBUTE_NAME &&
869-
this._state !== BEFORE_ATTRIBUTE_VALUE &&
870-
this._state !== AFTER_ATTRIBUTE_NAME &&
871-
this._state !== IN_ATTRIBUTE_NAME &&
872-
this._state !== IN_ATTRIBUTE_VALUE_SQ &&
873-
this._state !== IN_ATTRIBUTE_VALUE_DQ &&
874-
this._state !== IN_ATTRIBUTE_VALUE_NQ &&
875-
this._state !== IN_CLOSING_TAG_NAME
876-
){
877-
this._cbs.ontext(data);
873+
} else if(this._state === TEXT){
874+
this._flushText();
878875
}
876+
879877
//else, ignore remaining data
880878
//TODO add a way to remove current tag
881879
};
@@ -901,6 +899,31 @@ Tokenizer.prototype._emitPartial = function(value){
901899
if(this._baseState !== TEXT){
902900
this._cbs.onattribdata(value); //TODO implement the new event
903901
} else {
904-
this._cbs.ontext(value);
902+
if(this._eagerTextCapture) {
903+
this._content += value;
904+
} else {
905+
this._cbs.ontext(value);
906+
}
907+
}
908+
};
909+
910+
Tokenizer.prototype._captureText = function(){
911+
if(this._index <= this._sectionStart){
912+
return;
913+
}
914+
var text = this._getSection();
915+
if(!this._eagerTextCapture){
916+
this._cbs.ontext(text);
917+
} else {
918+
this._content += text;
905919
}
906920
};
921+
922+
Tokenizer.prototype._flushText = function(){
923+
if(!this._content){
924+
return;
925+
}
926+
this._cbs.ontext(this._content);
927+
this._content = "";
928+
};
929+

lib/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ module.exports = {
5656
cdatastart: 0,
5757
cdataend: 0,
5858
text: 1,
59+
parserinit: 2,
5960
processinginstruction: 2,
6061
comment: 1,
6162
commentend: 0,

test/02-stream.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ helper.mochaTest("Stream", __dirname, function(test, cb){
99
new Stream(
1010
helper.getEventCollector(function(err, events){
1111
cb(err, events);
12+
cb(err, events);
1213

1314
var handler = helper.getEventCollector(cb),
1415
stream = new Stream(handler, test.options);

test/test-helper.js

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,43 +6,64 @@ var htmlparser2 = require(".."),
66
CollectingHandler = htmlparser2.CollectingHandler;
77

88
exports.writeToParser = function(handler, options, data){
9-
var parser = new Parser(handler, options);
9+
options = options || {};
10+
var i, parser = new Parser(handler, options);
11+
1012
//first, try to run the test via chunks
11-
for(var i = 0; i < data.length; i++){
13+
for(i = 0; i < data.length; i++){
1214
parser.write(data.charAt(i));
1315
}
1416
parser.end();
17+
1518
//then parse everything
1619
parser.parseComplete(data);
20+
parser.reset();
21+
22+
//and once again using the `'eagerTextCapture'` option
23+
options.eagerTextCapture = true;
24+
parser = new Parser(handler, options);
25+
for(i = 0; i < data.length; i++){
26+
parser.write(data.charAt(i));
27+
}
28+
parser.end();
1729
};
1830

1931
//returns a tree structure
2032
exports.getEventCollector = function(cb){
21-
var handler = new CollectingHandler({onerror: cb, onend: onend});
33+
var handler = new CollectingHandler({onparserinit: init, onerror: cb, onend: onend});
2234

2335
return handler;
2436

37+
function init(parser, parserOptions){
38+
this._parser = parser;
39+
this._parserOptions = parserOptions;
40+
}
41+
2542
function onend(){
26-
cb(null, handler.events.reduce(eventReducer, []));
43+
cb(null, eventReducer(handler.events, this._parserOptions.eagerTextCapture), false);
2744
}
2845
};
2946

30-
function eventReducer(events, arr){
31-
if(arr[0] === "onerror" || arr[0] === "onend");
32-
else if(arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){
33-
events[events.length - 1].data[0] += arr[1];
34-
} else {
35-
events.push({
36-
event: arr[0].substr(2),
37-
data: arr.slice(1)
38-
});
39-
}
47+
function eventReducer(toReduce, arr, eagerTextCapture){
48+
var events = [];
4049

50+
toReduce.forEach(function(arr){
51+
if(arr[0] === "onparserinit" || arr[0] === "onerror" || arr[0] === "onend"){
52+
return;
53+
} else if(!eagerTextCapture && arr[0] === "ontext" && events.length && events[events.length - 1].event === "text"){
54+
events[events.length - 1].data[0] += arr[1];
55+
} else {
56+
events.push({
57+
event: arr[0].substr(2),
58+
data: arr.slice(1)
59+
});
60+
}
61+
});
4162
return events;
4263
}
4364

4465
function getCallback(expected, done){
45-
var repeated = false;
66+
var repeated = 0;
4667

4768
return function(err, actual){
4869
assert.ifError(err);
@@ -51,11 +72,12 @@ function getCallback(expected, done){
5172
} catch(e){
5273
e.expected = JSON.stringify(expected, null, 2);
5374
e.actual = JSON.stringify(actual, null, 2);
75+
console.log(e.actual);
5476
throw e;
5577
}
5678

57-
if(repeated) done();
58-
else repeated = true;
79+
if(repeated === 2) done();
80+
else repeated++;
5981
};
6082
}
6183

0 commit comments

Comments
 (0)