Skip to content

Commit 518c432

Browse files
committed
Refactor and improve performance of RDoc::Markup::Parser
This change introduces a wrapper around StringScanner that is aware of the current position (column and lineno). It has two advantages: it is faster and more modular. The old code frequently ran `@input.byteslice(0, byte_offset).length` to get the current position, which was painfully slow. This change keeps track of the position at each scan, which cuts the time of "Generating RI format into ..." in Ruby's `make rdoc` roughly in half (5.5 sec -> 3.0 sec). The old code also used four instance variables (`@input`, `@line`, `@line_pos`, and `@s`) to track the position. This change factors them out into MyStringScanner, so now only one variable (`@s`) is needed.
1 parent 09a0c91 commit 518c432

File tree

3 files changed

+65
-67
lines changed

3 files changed

+65
-67
lines changed

lib/rdoc/markup/parser.rb

Lines changed: 59 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,6 @@ def initialize
8080
@binary_input = nil
8181
@current_token = nil
8282
@debug = false
83-
@input = nil
84-
@input_encoding = nil
85-
@line = 0
86-
@line_pos = 0
8783
@s = nil
8884
@tokens = []
8985
end
@@ -319,13 +315,6 @@ def build_verbatim margin
319315
verbatim
320316
end
321317

322-
##
323-
# The character offset for the input string at the given +byte_offset+
324-
325-
def char_pos byte_offset
326-
@input.byteslice(0, byte_offset).length
327-
end
328-
329318
##
330319
# Pulls the next token from the stream.
331320

@@ -424,15 +413,54 @@ def peek_token
424413
token
425414
end
426415

416+
##
417+
# A simple wrapper of StringScanner that is aware of the current column and lineno
418+
419+
class MyStringScanner
420+
def initialize(input)
421+
@line = @column = 0
422+
@s = StringScanner.new input
423+
end
424+
425+
def scan(re)
426+
prev_pos = @s.pos
427+
ret = @s.scan(re)
428+
@column += ret.length if ret
429+
ret
430+
end
431+
432+
def unscan(s)
433+
@s.pos -= s.bytesize
434+
@column -= s.length
435+
end
436+
437+
def pos
438+
[@column, @line]
439+
end
440+
441+
def newline!
442+
@column = 0
443+
@line += 1
444+
end
445+
446+
def eos?
447+
@s.eos?
448+
end
449+
450+
def matched
451+
@s.matched
452+
end
453+
454+
def [](i)
455+
@s[i]
456+
end
457+
end
458+
427459
##
428460
# Creates the StringScanner
429461

430462
def setup_scanner input
431-
@line = 0
432-
@line_pos = 0
433-
@input = input.dup
434-
435-
@s = StringScanner.new input
463+
@s = MyStringScanner.new input
436464
end
437465

438466
##
@@ -467,31 +495,30 @@ def tokenize input
467495
@tokens << case
468496
# [CR]LF => :NEWLINE
469497
when @s.scan(/\r?\n/) then
470-
token = [:NEWLINE, @s.matched, *token_pos(pos)]
471-
@line_pos = char_pos @s.pos
472-
@line += 1
498+
token = [:NEWLINE, @s.matched, *pos]
499+
@s.newline!
473500
token
474501
# === text => :HEADER then :TEXT
475502
when @s.scan(/(=+)(\s*)/) then
476503
level = @s[1].length
477-
header = [:HEADER, level, *token_pos(pos)]
504+
header = [:HEADER, level, *pos]
478505

479506
if @s[2] =~ /^\r?\n/ then
480-
@s.pos -= @s[2].length
507+
@s.unscan(@s[2])
481508
header
482509
else
483510
pos = @s.pos
484511
@s.scan(/.*/)
485512
@tokens << header
486-
[:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
513+
[:TEXT, @s.matched.sub(/\r$/, ''), *pos]
487514
end
488515
# --- (at least 3) and nothing else on the line => :RULE
489516
when @s.scan(/(-{3,}) *\r?$/) then
490-
[:RULE, @s[1].length - 2, *token_pos(pos)]
517+
[:RULE, @s[1].length - 2, *pos]
491518
# * or - followed by white space and text => :BULLET
492519
when @s.scan(/([*-]) +(\S)/) then
493-
@s.pos -= @s[2].bytesize # unget \S
494-
[:BULLET, @s[1], *token_pos(pos)]
520+
@s.unscan(@s[2])
521+
[:BULLET, @s[1], *pos]
495522
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
496523
when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
497524
# FIXME if tab(s), the column will be wrong
@@ -500,7 +527,7 @@ def tokenize input
500527
# before (and provide a check for that at least in debug
501528
# mode)
502529
list_label = @s[1]
503-
@s.pos -= @s[2].bytesize # unget \S
530+
@s.unscan(@s[2])
504531
list_type =
505532
case list_label
506533
when /[a-z]/ then :LALPHA
@@ -509,24 +536,24 @@ def tokenize input
509536
else
510537
raise ParseError, "BUG token #{list_label}"
511538
end
512-
[list_type, list_label, *token_pos(pos)]
539+
[list_type, list_label, *pos]
513540
# [text] followed by spaces or end of line => :LABEL
514541
when @s.scan(/\[(.*?)\]( +|\r?$)/) then
515-
[:LABEL, @s[1], *token_pos(pos)]
542+
[:LABEL, @s[1], *pos]
516543
# text:: followed by spaces or end of line => :NOTE
517544
when @s.scan(/(.*?)::( +|\r?$)/) then
518-
[:NOTE, @s[1], *token_pos(pos)]
545+
[:NOTE, @s[1], *pos]
519546
# >>> followed by end of line => :BLOCKQUOTE
520547
when @s.scan(/>>> *(\w+)?$/) then
521-
[:BLOCKQUOTE, @s[1], *token_pos(pos)]
548+
[:BLOCKQUOTE, @s[1], *pos]
522549
# anything else: :TEXT
523550
else
524551
@s.scan(/(.*?)( )?\r?$/)
525-
token = [:TEXT, @s[1], *token_pos(pos)]
552+
token = [:TEXT, @s[1], *pos]
526553

527554
if @s[2] then
528555
@tokens << token
529-
[:BREAK, @s[2], *token_pos(pos + @s[1].length)]
556+
[:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
530557
else
531558
token
532559
end
@@ -536,16 +563,6 @@ def tokenize input
536563
self
537564
end
538565

539-
##
540-
# Calculates the column (by character) and line of the current token based
541-
# on +byte_offset+.
542-
543-
def token_pos byte_offset
544-
offset = char_pos byte_offset
545-
546-
[offset - @line_pos, @line]
547-
end
548-
549566
##
550567
# Returns the current token to the token stream
551568

lib/rdoc/tom_doc.rb

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -242,19 +242,18 @@ def tokenize text
242242

243243
@tokens << case
244244
when @s.scan(/\r?\n/) then
245-
token = [:NEWLINE, @s.matched, *token_pos(pos)]
246-
@line_pos = char_pos @s.pos
247-
@line += 1
245+
token = [:NEWLINE, @s.matched, *pos]
246+
@s.newline!
248247
token
249248
when @s.scan(/(Examples|Signature)$/) then
250-
@tokens << [:HEADER, 3, *token_pos(pos)]
249+
@tokens << [:HEADER, 3, *pos]
251250

252-
[:TEXT, @s[1], *token_pos(pos)]
251+
[:TEXT, @s[1], *pos]
253252
when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then
254-
[:NOTE, @s[1], *token_pos(pos)]
253+
[:NOTE, @s[1], *pos]
255254
else
256255
@s.scan(/.*/)
257-
[:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
256+
[:TEXT, @s.matched.sub(/\r$/, ''), *pos]
258257
end
259258
end
260259

test/rdoc/test_rdoc_markup_parser.rb

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,6 @@ def test_build_heading
2222
assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3)
2323
end
2424

25-
def test_char_pos
26-
parser = @RMP.new
27-
s = parser.setup_scanner 'cät'
28-
29-
s.scan(/\S+/)
30-
31-
assert_equal 3, parser.char_pos(s.pos)
32-
end
33-
3425
def test_get
3526
parser = util_parser
3627

@@ -1647,15 +1638,6 @@ def test_tokenize_verbatim_rule_fancy
16471638
assert_equal expected, @RMP.tokenize(str)
16481639
end
16491640

1650-
def test_token_pos
1651-
parser = @RMP.new
1652-
s = parser.setup_scanner 'cät'
1653-
1654-
s.scan(/\S+/)
1655-
1656-
assert_equal [3, 0], parser.token_pos(s.pos)
1657-
end
1658-
16591641
# HACK move to Verbatim test case
16601642
def test_verbatim_normalize
16611643
v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n"

0 commit comments

Comments
 (0)