Skip to content

Commit c93ea51

Browse files
committed
Clone Token.Characters into PendingTableCharacters
Keeps the source start/end position tracking for table text that is moved out of the table. Fixes #1927
1 parent dea4969 commit c93ea51

File tree

5 files changed

+52
-18
lines changed

5 files changed

+52
-18
lines changed

CHANGES

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ Release 1.16.1 [PENDING]
2424
again, causing errors when fetched.
2525
<https://github.com/jhy/jsoup/issues/1902>
2626

27+
* Bugfix: when tracking input source positions, text in tables that was foster-parented (moved out of the table) had invalid source positions.
28+
<https://github.com/jhy/jsoup/issues/1927>
29+
2730
* Bugfix: If the Document.OutputSettings class was initialized, and Entities.escape(String) was then called, an NPE may be
2831
thrown due to a class loading circular dependency.
2932
<https://github.com/jhy/jsoup/issues/1910>

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public class HtmlTreeBuilder extends TreeBuilder {
5353
private @Nullable Element contextElement; // fragment parse context -- could be null even if fragment parsing
5454
private ArrayList<Element> formattingElements; // active (open) formatting elements
5555
private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
56-
private List<String> pendingTableCharacters; // chars in table to be shifted out
56+
private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
5757
private Token.EndTag emptyEnd; // reused empty end tag
5858

5959
private boolean framesetOk; // if ok to go into frameset
@@ -676,14 +676,20 @@ void setFormElement(FormElement formElement) {
676676
this.formElement = formElement;
677677
}
678678

679-
void newPendingTableCharacters() {
679+
void resetPendingTableCharacters() {
680680
pendingTableCharacters = new ArrayList<>();
681681
}
682682

683-
List<String> getPendingTableCharacters() {
683+
List<Token.Character> getPendingTableCharacters() {
684684
return pendingTableCharacters;
685685
}
686686

687+
void addPendingTableCharacters(Token.Character c) {
688+
// make a clone of the token to maintain its state (as Tokens are otherwise reset)
689+
Token.Character clone = c.clone();
690+
pendingTableCharacters.add(clone);
691+
}
692+
687693
/**
688694
13.2.6.3 Closing elements that have implied end tags
689695
When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements.

src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import org.jsoup.nodes.Document;
77
import org.jsoup.nodes.DocumentType;
88
import org.jsoup.nodes.Element;
9-
import org.jsoup.nodes.Node;
109

1110
import java.util.ArrayList;
1211

@@ -995,7 +994,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
995994
InTable {
996995
boolean process(Token t, HtmlTreeBuilder tb) {
997996
if (t.isCharacter() && inSorted(tb.currentElement().normalName(), InTableFoster)) {
998-
tb.newPendingTableCharacters();
997+
tb.resetPendingTableCharacters();
999998
tb.markInsertionMode();
1000999
tb.transition(InTableText);
10011000
return tb.process(t);
@@ -1106,25 +1105,25 @@ boolean process(Token t, HtmlTreeBuilder tb) {
11061105
tb.error(this);
11071106
return false;
11081107
} else {
1109-
tb.getPendingTableCharacters().add(c.getData());
1108+
tb.addPendingTableCharacters(c);
11101109
}
1111-
} else {// todo - don't really like the way these table character data lists are built
1110+
} else {
11121111
if (tb.getPendingTableCharacters().size() > 0) {
1113-
for (String character : tb.getPendingTableCharacters()) {
1114-
if (!isWhitespace(character)) {
1112+
for (Token.Character c : tb.getPendingTableCharacters()) {
1113+
if (!isWhitespace(c)) {
11151114
// InTable anything else section:
11161115
tb.error(this);
11171116
if (inSorted(tb.currentElement().normalName(), InTableFoster)) {
11181117
tb.setFosterInserts(true);
1119-
tb.process(new Token.Character().data(character), InBody);
1118+
tb.process(c, InBody);
11201119
tb.setFosterInserts(false);
11211120
} else {
1122-
tb.process(new Token.Character().data(character), InBody);
1121+
tb.process(c, InBody);
11231122
}
11241123
} else
1125-
tb.insert(new Token.Character().data(character));
1124+
tb.insert(c);
11261125
}
1127-
tb.newPendingTableCharacters();
1126+
tb.resetPendingTableCharacters();
11281127
}
11291128
tb.transition(tb.originalState());
11301129
return tb.process(t);
@@ -1759,10 +1758,6 @@ private static boolean isWhitespace(Token t) {
17591758
return false;
17601759
}
17611760

1762-
private static boolean isWhitespace(String data) {
1763-
return StringUtil.isBlank(data);
1764-
}
1765-
17661761
private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) {
17671762
tb.tokeniser.transition(TokeniserState.Rcdata);
17681763
tb.markInsertionMode();

src/main/java/org/jsoup/parser/Token.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ public String toString() {
382382
}
383383
}
384384

385-
static class Character extends Token {
385+
static class Character extends Token implements Cloneable {
386386
private String data;
387387

388388
Character() {
@@ -410,6 +410,14 @@ String getData() {
410410
public String toString() {
411411
return getData();
412412
}
413+
414+
@Override protected Token.Character clone() {
415+
try {
416+
return (Token.Character) super.clone();
417+
} catch (CloneNotSupportedException e) {
418+
throw new RuntimeException(e);
419+
}
420+
}
413421
}
414422

415423
final static class CData extends Character {

src/test/java/org/jsoup/nodes/PositionTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
import org.jsoup.Jsoup;
44
import org.jsoup.integration.servlets.FileServlet;
55
import org.jsoup.parser.Parser;
6+
import org.jsoup.select.NodeTraversor;
67
import org.junit.jupiter.api.Test;
78

89
import java.io.IOException;
10+
import java.util.ArrayList;
11+
import java.util.List;
912

1013
import static org.junit.jupiter.api.Assertions.*;
1114

@@ -169,4 +172,23 @@ class PositionTest {
169172
assertEquals("17,5:779-17,12:786", item.endSourceRange().toString());
170173
}
171174

175+
@Test void tracksTableMovedText() {
176+
String html = "<table>foo<tr>bar<td>baz</td>qux</tr>coo</table>";
177+
Document doc = Jsoup.parse(html, TrackingParser);
178+
179+
List<TextNode> textNodes = new ArrayList<>();
180+
NodeTraversor.traverse((Node node, int depth) -> {
181+
if (node instanceof TextNode) {
182+
textNodes.add((TextNode) node);
183+
}
184+
}, doc);
185+
186+
assertEquals(5, textNodes.size());
187+
assertEquals("1,8:7-1,11:10", textNodes.get(0).sourceRange().toString());
188+
assertEquals("1,15:14-1,18:17", textNodes.get(1).sourceRange().toString());
189+
assertEquals("1,22:21-1,25:24", textNodes.get(2).sourceRange().toString());
190+
assertEquals("1,30:29-1,33:32", textNodes.get(3).sourceRange().toString());
191+
assertEquals("1,38:37-1,41:40", textNodes.get(4).sourceRange().toString());
192+
}
193+
172194
}

0 commit comments

Comments
 (0)