Commit fba8d45

Merge pull request #730 from mame/position-aware-stringscanner-wrapper

Refactor and improve performance of RDoc::Markup::Parser

2 parents: bf4e79c + 518c432

3 files changed: +65 -67 lines
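Why this is faster: the deleted char_pos helper recomputed a character offset from a byte offset by decoding the whole input prefix (@input.byteslice(0, byte_offset).length), and token_pos called it for every token, so tokenizing long documents was effectively quadratic. The new MyStringScanner wrapper instead adds each match's character length to a running column, which is constant work per token. A rough, self-contained illustration of the difference (the input, the loop, and the freestanding char_pos variant here are synthetic, not from the commit):

require 'strscan'
require 'benchmark'

input = "word " * 20_000

# Old style: recover the character position by decoding the whole
# prefix on every token -- O(prefix length) per call.
def char_pos(input, byte_offset)
  input.byteslice(0, byte_offset).length
end

s = StringScanner.new input
slow = Benchmark.realtime do
  char_pos(input, s.pos) while s.scan(/\S+\s*/)
end

# New style: keep a running column and bump it by the match length -- O(1) per token.
s = StringScanner.new input
column = 0
fast = Benchmark.realtime do
  while (tok = s.scan(/\S+\s*/))
    column += tok.length
  end
end

printf "prefix decoding: %.3fs  incremental: %.3fs\n", slow, fast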

lib/rdoc/markup/parser.rb: 59 additions & 42 deletions
@@ -80,10 +80,6 @@ def initialize
     @binary_input = nil
     @current_token = nil
     @debug = false
-    @input = nil
-    @input_encoding = nil
-    @line = 0
-    @line_pos = 0
     @s = nil
     @tokens = []
   end
@@ -319,13 +315,6 @@ def build_verbatim margin
     verbatim
   end

-  ##
-  # The character offset for the input string at the given +byte_offset+
-
-  def char_pos byte_offset
-    @input.byteslice(0, byte_offset).length
-  end
-
   ##
   # Pulls the next token from the stream.

@@ -424,15 +413,54 @@ def peek_token
     token
   end

+  ##
+  # A simple wrapper of StringScanner that is aware of the current column and lineno
+
+  class MyStringScanner
+    def initialize(input)
+      @line = @column = 0
+      @s = StringScanner.new input
+    end
+
+    def scan(re)
+      prev_pos = @s.pos
+      ret = @s.scan(re)
+      @column += ret.length if ret
+      ret
+    end
+
+    def unscan(s)
+      @s.pos -= s.bytesize
+      @column -= s.length
+    end
+
+    def pos
+      [@column, @line]
+    end
+
+    def newline!
+      @column = 0
+      @line += 1
+    end
+
+    def eos?
+      @s.eos?
+    end
+
+    def matched
+      @s.matched
+    end
+
+    def [](i)
+      @s[i]
+    end
+  end
+
   ##
   # Creates the StringScanner

   def setup_scanner input
-    @line = 0
-    @line_pos = 0
-    @input = input.dup
-
-    @s = StringScanner.new input
+    @s = MyStringScanner.new input
   end

   ##
@@ -467,31 +495,30 @@ def tokenize input
       @tokens << case
                  # [CR]LF => :NEWLINE
                  when @s.scan(/\r?\n/) then
-                   token = [:NEWLINE, @s.matched, *token_pos(pos)]
-                   @line_pos = char_pos @s.pos
-                   @line += 1
+                   token = [:NEWLINE, @s.matched, *pos]
+                   @s.newline!
                    token
                  # === text => :HEADER then :TEXT
                  when @s.scan(/(=+)(\s*)/) then
                    level = @s[1].length
-                   header = [:HEADER, level, *token_pos(pos)]
+                   header = [:HEADER, level, *pos]

                    if @s[2] =~ /^\r?\n/ then
-                     @s.pos -= @s[2].length
+                     @s.unscan(@s[2])
                      header
                    else
                      pos = @s.pos
                      @s.scan(/.*/)
                      @tokens << header
-                     [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+                     [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
                    end
                  # --- (at least 3) and nothing else on the line => :RULE
                  when @s.scan(/(-{3,}) *\r?$/) then
-                   [:RULE, @s[1].length - 2, *token_pos(pos)]
+                   [:RULE, @s[1].length - 2, *pos]
                  # * or - followed by white space and text => :BULLET
                  when @s.scan(/([*-]) +(\S)/) then
-                   @s.pos -= @s[2].bytesize # unget \S
-                   [:BULLET, @s[1], *token_pos(pos)]
+                   @s.unscan(@s[2])
+                   [:BULLET, @s[1], *pos]
                  # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
                  when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
                    # FIXME if tab(s), the column will be wrong
@@ -500,7 +527,7 @@ def tokenize input
                    # before (and provide a check for that at least in debug
                    # mode)
                    list_label = @s[1]
-                   @s.pos -= @s[2].bytesize # unget \S
+                   @s.unscan(@s[2])
                    list_type =
                      case list_label
                      when /[a-z]/ then :LALPHA
@@ -509,24 +536,24 @@ def tokenize input
                      else
                        raise ParseError, "BUG token #{list_label}"
                      end
-                   [list_type, list_label, *token_pos(pos)]
+                   [list_type, list_label, *pos]
                  # [text] followed by spaces or end of line => :LABEL
                  when @s.scan(/\[(.*?)\]( +|\r?$)/) then
-                   [:LABEL, @s[1], *token_pos(pos)]
+                   [:LABEL, @s[1], *pos]
                  # text:: followed by spaces or end of line => :NOTE
                  when @s.scan(/(.*?)::( +|\r?$)/) then
-                   [:NOTE, @s[1], *token_pos(pos)]
+                   [:NOTE, @s[1], *pos]
                  # >>> followed by end of line => :BLOCKQUOTE
                  when @s.scan(/>>> *(\w+)?$/) then
-                   [:BLOCKQUOTE, @s[1], *token_pos(pos)]
+                   [:BLOCKQUOTE, @s[1], *pos]
                  # anything else: :TEXT
                  else
                    @s.scan(/(.*?)( )?\r?$/)
-                   token = [:TEXT, @s[1], *token_pos(pos)]
+                   token = [:TEXT, @s[1], *pos]

                    if @s[2] then
                      @tokens << token
-                     [:BREAK, @s[2], *token_pos(pos + @s[1].length)]
+                     [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
                    else
                      token
                    end
@@ -536,16 +563,6 @@ def tokenize input
     self
   end

-  ##
-  # Calculates the column (by character) and line of the current token based
-  # on +byte_offset+.
-
-  def token_pos byte_offset
-    offset = char_pos byte_offset
-
-    [offset - @line_pos, @line]
-  end
-
   ##
   # Returns the current token to the token stream

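Taken together, the parser now asks the scanner for its position instead of reconstructing it: MyStringScanner#pos returns [column, line] ready to splat into a token tuple, scan advances the column by the match's character length, unscan rewinds the underlying scanner by bytes but the column by characters (which is what keeps multibyte input correct), and the caller signals line breaks via newline!. A small standalone walk-through, assuming the class is reachable as RDoc::Markup::Parser::MyStringScanner once this commit is loaded:

require 'rdoc'

s = RDoc::Markup::Parser::MyStringScanner.new "cät bar\nbaz"

s.scan(/\S+/)         # => "cät" -- 4 bytes, but 3 characters
s.pos                 # => [3, 0] -- the column counts characters

s.scan(/ +/)
word = s.scan(/\S+/)  # => "bar"
s.unscan(word)        # bytes off the scanner, characters off the column
s.pos                 # => [4, 0]

s.scan(/\S+/)         # re-scan "bar"
s.scan(/\r?\n/)       # consume the newline...
s.newline!            # ...then the caller resets the column and bumps the line
s.pos                 # => [0, 1]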
lib/rdoc/tom_doc.rb: 6 additions & 7 deletions
@@ -242,19 +242,18 @@ def tokenize text

       @tokens << case
                  when @s.scan(/\r?\n/) then
-                   token = [:NEWLINE, @s.matched, *token_pos(pos)]
-                   @line_pos = char_pos @s.pos
-                   @line += 1
+                   token = [:NEWLINE, @s.matched, *pos]
+                   @s.newline!
                    token
                  when @s.scan(/(Examples|Signature)$/) then
-                   @tokens << [:HEADER, 3, *token_pos(pos)]
+                   @tokens << [:HEADER, 3, *pos]

-                   [:TEXT, @s[1], *token_pos(pos)]
+                   [:TEXT, @s[1], *pos]
                  when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then
-                   [:NOTE, @s[1], *token_pos(pos)]
+                   [:NOTE, @s[1], *pos]
                  else
                    @s.scan(/.*/)
-                   [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+                   [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
                  end
       end

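The tom_doc.rb hunk is the same substitution as in parser.rb: the pos captured at the top of the tokenizer loop is now already a [column, line] pair, so it splats directly into each token. The only place that still needs arithmetic is the parser's :BREAK token, whose column lies past the scanned text, hence the explicit pos[0] + @s[1].length, pos[1] rather than a splat. Illustrative tuple shapes (the values here are made up for the example):

pos  = [4, 2]    # column 4, line 2, as MyStringScanner#pos would report
text = "hello"

token = [:TEXT, text, *pos]
# => [:TEXT, "hello", 4, 2]

# :BREAK starts after the text, so its column is offset by hand:
brk = [:BREAK, " ", pos[0] + text.length, pos[1]]
# => [:BREAK, " ", 9, 2]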
test/rdoc/test_rdoc_markup_parser.rb: 0 additions & 18 deletions
@@ -22,15 +22,6 @@ def test_build_heading
     assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3)
   end

-  def test_char_pos
-    parser = @RMP.new
-    s = parser.setup_scanner 'cät'
-
-    s.scan(/\S+/)
-
-    assert_equal 3, parser.char_pos(s.pos)
-  end
-
   def test_get
     parser = util_parser

@@ -1646,15 +1637,6 @@ def test_tokenize_verbatim_rule_fancy
     assert_equal expected, @RMP.tokenize(str)
   end

-  def test_token_pos
-    parser = @RMP.new
-    s = parser.setup_scanner 'cät'
-
-    s.scan(/\S+/)
-
-    assert_equal [3, 0], parser.token_pos(s.pos)
-  end
-
   # HACK move to Verbatim test case
   def test_verbatim_normalize
     v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n"
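The two deleted tests covered char_pos and token_pos, which no longer exist. Since setup_scanner now returns a MyStringScanner, and its pos reports what token_pos used to compute, an equivalent check against the new interface might look like this (a sketch, not part of the commit; the test name is made up):

def test_scanner_pos
  parser = @RMP.new
  s = parser.setup_scanner 'cät'

  s.scan(/\S+/)

  # token_pos(s.pos) used to return [3, 0]; the wrapper reports it directly.
  assert_equal [3, 0], s.pos
end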
