Switch raw HTML block handling back to a preprocessor

waylan · waylan · commit 7a8a6b5f9f6f · 2020-07-01T13:10:05.000-04:00
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py
@@ -46,7 +46,6 @@ def build_block_parser(md, **kwargs):
     parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100)
     parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90)
     parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80)
-    parser.blockprocessors.register(RawHtmlProcessor(parser), 'html', 75)
     parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70)
     parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60)
     parser.blockprocessors.register(HRProcessor(parser), 'hr', 50)
@@ -273,29 +272,6 @@ def run(self, parent, blocks):
             blocks.insert(0, theRest)
 
 
-class RawHtmlProcessor(BlockProcessor):
-
-    TAG_RE = re.compile(r'(^|\n)[ ]{0,3}<([?!].*?|(?P<tag>[^<> ]+)[^<>]*)>', re.S | re.U)
-
-    def test(self, parent, block):
-        m = self.TAG_RE.search(block)
-        # If m but no 'tag', then we have a comment, declaration, or processing instruction.
-        return m and (self.parser.md.is_block_level(m.group('tag')) or not m.group('tag'))
-
-    def run(self, parent, blocks):
-        parser = HTMLExtractor(md=self.parser.md)
-        while blocks:
-            parser.feed(blocks.pop(0) + '\n\n')
-            if not parser.inraw:
-                break
-        parser.close()
-        # Insert Markdown back into blocks with raw HTML extracted.
-        parts = ''.join(parser.cleandoc).split('\n\n')
-        parts.reverse()
-        for block in parts:
-            blocks.insert(0, block)
-
-
 class BlockQuoteProcessor(BlockProcessor):
 
     RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
@@ -26,13 +26,15 @@
 """
 
 from . import util
+from .htmlparser import HTMLExtractor
 import re
 
 
 def build_preprocessors(md, **kwargs):
     """ Build the default set of preprocessors used by Markdown. """
     preprocessors = util.Registry()
     preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
+    preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
     preprocessors.register(ReferencePreprocessor(md), 'reference', 10)
     return preprocessors
 
@@ -70,6 +72,17 @@ def run(self, lines):
         return source.split('\n')
 
 
+class HtmlBlockPreprocessor(Preprocessor):
+    """Remove HTML blocks from the text and store them for later retrieval."""
+
+    def run(self, lines):
+        source = '\n'.join(lines)  # the extractor is fed the whole document as one string
+        parser = HTMLExtractor(md=self.md)
+        parser.feed(source)
+        parser.close()
+        return ''.join(parser.cleandoc).split('\n')  # cleaned text, split back into lines
+
+
 class ReferencePreprocessor(Preprocessor):
     """ Remove reference definitions from text and store for later use. """