Skip to content

Refactor and improve performance of RDoc::Markup::Parser #730

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 59 additions & 42 deletions lib/rdoc/markup/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,6 @@ def initialize
@binary_input = nil
@current_token = nil
@debug = false
@input = nil
@input_encoding = nil
@line = 0
@line_pos = 0
@s = nil
@tokens = []
end
Expand Down Expand Up @@ -319,13 +315,6 @@ def build_verbatim margin
verbatim
end

##
# The character offset for the input string at the given +byte_offset+

def char_pos byte_offset
@input.byteslice(0, byte_offset).length
end

##
# Pulls the next token from the stream.

Expand Down Expand Up @@ -424,15 +413,54 @@ def peek_token
token
end

##
# A simple wrapper of StringScanner that is aware of the current column and lineno

class MyStringScanner
  ##
  # A simple wrapper of StringScanner that tracks the current column and
  # line number as scanning proceeds, so callers no longer need to compute
  # character offsets from byte offsets after the fact.
  #
  # Note: the line counter is not advanced automatically; the consumer must
  # call #newline! after scanning a line terminator.

  def initialize(input)
    @line = @column = 0
    @s = StringScanner.new input
  end

  ##
  # Scans +re+ at the current position. On a match the column advances by
  # the matched text's character length (not byte length) and the matched
  # string is returned; otherwise returns nil and the position is unchanged.

  def scan(re)
    ret = @s.scan(re)
    @column += ret.length if ret
    ret
  end

  ##
  # Pushes the previously matched substring +s+ back onto the scanner,
  # rewinding the byte position and the character-based column together.

  def unscan(s)
    @s.pos -= s.bytesize
    @column -= s.length
  end

  ##
  # The current position as a <tt>[column, line]</tt> pair (both 0-based).

  def pos
    [@column, @line]
  end

  ##
  # Records that a line terminator was consumed: resets the column to 0 and
  # advances the line counter.

  def newline!
    @column = 0
    @line += 1
  end

  ##
  # True when the underlying scanner has reached the end of input.

  def eos?
    @s.eos?
  end

  ##
  # The text of the most recent match, or nil if the last scan failed.

  def matched
    @s.matched
  end

  ##
  # The +i+th capture group of the most recent match.

  def [](i)
    @s[i]
  end
end

##
# Creates the StringScanner

def setup_scanner input
@line = 0
@line_pos = 0
@input = input.dup

@s = StringScanner.new input
@s = MyStringScanner.new input
end

##
Expand Down Expand Up @@ -467,31 +495,30 @@ def tokenize input
@tokens << case
# [CR]LF => :NEWLINE
when @s.scan(/\r?\n/) then
token = [:NEWLINE, @s.matched, *token_pos(pos)]
@line_pos = char_pos @s.pos
@line += 1
token = [:NEWLINE, @s.matched, *pos]
@s.newline!
token
# === text => :HEADER then :TEXT
when @s.scan(/(=+)(\s*)/) then
level = @s[1].length
header = [:HEADER, level, *token_pos(pos)]
header = [:HEADER, level, *pos]

if @s[2] =~ /^\r?\n/ then
@s.pos -= @s[2].length
@s.unscan(@s[2])
header
else
pos = @s.pos
@s.scan(/.*/)
@tokens << header
[:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
[:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end
# --- (at least 3) and nothing else on the line => :RULE
when @s.scan(/(-{3,}) *\r?$/) then
[:RULE, @s[1].length - 2, *token_pos(pos)]
[:RULE, @s[1].length - 2, *pos]
# * or - followed by white space and text => :BULLET
when @s.scan(/([*-]) +(\S)/) then
@s.pos -= @s[2].bytesize # unget \S
[:BULLET, @s[1], *token_pos(pos)]
@s.unscan(@s[2])
[:BULLET, @s[1], *pos]
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
# FIXME if tab(s), the column will be wrong
Expand All @@ -500,7 +527,7 @@ def tokenize input
# before (and provide a check for that at least in debug
# mode)
list_label = @s[1]
@s.pos -= @s[2].bytesize # unget \S
@s.unscan(@s[2])
list_type =
case list_label
when /[a-z]/ then :LALPHA
Expand All @@ -509,24 +536,24 @@ def tokenize input
else
raise ParseError, "BUG token #{list_label}"
end
[list_type, list_label, *token_pos(pos)]
[list_type, list_label, *pos]
# [text] followed by spaces or end of line => :LABEL
when @s.scan(/\[(.*?)\]( +|\r?$)/) then
[:LABEL, @s[1], *token_pos(pos)]
[:LABEL, @s[1], *pos]
# text:: followed by spaces or end of line => :NOTE
when @s.scan(/(.*?)::( +|\r?$)/) then
[:NOTE, @s[1], *token_pos(pos)]
[:NOTE, @s[1], *pos]
# >>> followed by end of line => :BLOCKQUOTE
when @s.scan(/>>> *(\w+)?$/) then
[:BLOCKQUOTE, @s[1], *token_pos(pos)]
[:BLOCKQUOTE, @s[1], *pos]
# anything else: :TEXT
else
@s.scan(/(.*?)( )?\r?$/)
token = [:TEXT, @s[1], *token_pos(pos)]
token = [:TEXT, @s[1], *pos]

if @s[2] then
@tokens << token
[:BREAK, @s[2], *token_pos(pos + @s[1].length)]
[:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
else
token
end
Expand All @@ -536,16 +563,6 @@ def tokenize input
self
end

##
# Calculates the column (by character) and line of the current token based
# on +byte_offset+.

def token_pos byte_offset
offset = char_pos byte_offset

[offset - @line_pos, @line]
end

##
# Returns the current token to the token stream

Expand Down
13 changes: 6 additions & 7 deletions lib/rdoc/tom_doc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -242,19 +242,18 @@ def tokenize text

@tokens << case
when @s.scan(/\r?\n/) then
token = [:NEWLINE, @s.matched, *token_pos(pos)]
@line_pos = char_pos @s.pos
@line += 1
token = [:NEWLINE, @s.matched, *pos]
@s.newline!
token
when @s.scan(/(Examples|Signature)$/) then
@tokens << [:HEADER, 3, *token_pos(pos)]
@tokens << [:HEADER, 3, *pos]

[:TEXT, @s[1], *token_pos(pos)]
[:TEXT, @s[1], *pos]
when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then
[:NOTE, @s[1], *token_pos(pos)]
[:NOTE, @s[1], *pos]
else
@s.scan(/.*/)
[:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
[:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end
end

Expand Down
18 changes: 0 additions & 18 deletions test/rdoc/test_rdoc_markup_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,6 @@ def test_build_heading
assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3)
end

def test_char_pos
parser = @RMP.new
s = parser.setup_scanner 'cät'

s.scan(/\S+/)

assert_equal 3, parser.char_pos(s.pos)
end

def test_get
parser = util_parser

Expand Down Expand Up @@ -1647,15 +1638,6 @@ def test_tokenize_verbatim_rule_fancy
assert_equal expected, @RMP.tokenize(str)
end

def test_token_pos
parser = @RMP.new
s = parser.setup_scanner 'cät'

s.scan(/\S+/)

assert_equal [3, 0], parser.token_pos(s.pos)
end

# HACK move to Verbatim test case
def test_verbatim_normalize
v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n"
Expand Down