diff --git a/.spell-dict b/.spell-dict index 2fc515e1..51a6a327 100644 --- a/.spell-dict +++ b/.spell-dict @@ -178,7 +178,7 @@ plugins configs pre formatters - +unflattened dedented Setext unindented diff --git a/docs/changelog.md b/docs/changelog.md index 5ca092b2..204164f0 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -17,6 +17,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed * Backslash Unescape IDs set via `attr_list` on `toc` (#1493). +* `md_in_html` will process content inside "markdown" blocks in a similar way + to how it is parsed outside of "markdown" blocks, giving a more consistent + expectation to external extensions (#1503). +* `md_in_html` handles tags within inline code blocks better (#1075). +* `md_in_html` fixes handling of one-liner block HTML (#1074). ## [3.7] -- 2024-08-16 diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index c162c9b3..d1fbd7af 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -61,6 +61,7 @@ def reset(self): self.mdstack: list[str] = [] # When markdown=1, stack contains a list of tags self.treebuilder = etree.TreeBuilder() self.mdstate: list[Literal['block', 'span', 'off', None]] = [] + self.mdstarted: list[bool] = [] super().reset() def close(self): @@ -111,7 +112,10 @@ def handle_starttag(self, tag, attrs): self.handle_empty_tag(data, True) return - if tag in self.block_level_tags and (self.at_line_start() or self.intail): + if ( + tag in self.block_level_tags and + (self.at_line_start() or self.intail or self.mdstarted and self.mdstarted[-1]) + ): # Valueless attribute (ex: `<input checked>`) results in `[('checked', None)]`. # Convert to `{'checked': 'checked'}`.
attrs = {key: value if value is not None else key for key, value in attrs} @@ -126,8 +130,10 @@ def handle_starttag(self, tag, attrs): self.handle_endtag('p') self.mdstate.append(state) self.mdstack.append(tag) + self.mdstarted.append(True) attrs['markdown'] = state self.treebuilder.start(tag, attrs) + else: # Span level tag if self.inraw: @@ -151,6 +157,7 @@ def handle_endtag(self, tag): while self.mdstack: item = self.mdstack.pop() self.mdstate.pop() + self.mdstarted.pop() self.treebuilder.end(item) if item == tag: break @@ -163,6 +170,45 @@ def handle_endtag(self, tag): # If we only have one newline before block element, add another if not item.endswith('\n\n') and item.endswith('\n'): self.cleandoc.append('\n') + + # Flatten the HTML structure of "markdown" blocks such that when they + # get parsed, content will be parsed inside the blocks similarly to how + # it is parsed outside the blocks. Having real HTML elements in the tree + # before the adjacent content is processed can cause unpredictable + # issues for extensions. + current = element + last = [] + while current is not None: + for child in list(current): + current.remove(child) + text = current.text if current.text is not None else '' + tail = child.tail if child.tail is not None else '' + child.tail = None + state = child.attrib.get('markdown', 'off') + + # Add a newline to tail if it is not just a trailing newline + if tail != '\n': + tail = '\n' + tail.rstrip('\n') + + # Ensure there is an empty new line between blocks + if not text.endswith('\n\n'): + text = text.rstrip('\n') + '\n\n' + + # Process the block nested under the span appropriately + if state in ('span', 'block'): + current.text = f'{text}{self.md.htmlStash.store(child)}{tail}' + last.append(child) + else: + # Non-Markdown HTML will not be recursively parsed for Markdown, + # so we can just remove markers and leave them unflattened. + # Additionally, we don't need to append to our list for further + # processing.
+ child.attrib.pop('markdown') + [c.attrib.pop('markdown', None) for c in child.iter()] + current.text = f'{text}{self.md.htmlStash.store(child)}{tail}' + # Target the child elements that have been expanded. + current = last.pop(0) if last else None + self.cleandoc.append(self.md.htmlStash.store(element)) self.cleandoc.append('\n\n') self.state = [] @@ -208,6 +254,7 @@ def handle_data(self, data): if self.inraw or not self.mdstack: super().handle_data(data) else: + self.mdstarted[-1] = False self.treebuilder.data(data) def handle_empty_tag(self, data, is_block): @@ -216,8 +263,10 @@ def handle_empty_tag(self, data, is_block): else: if self.at_line_start() and is_block: self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') - else: + elif self.mdstate and self.mdstate[-1] == "off": self.handle_data(self.md.htmlStash.store(data)) + else: + self.handle_data(data) def parse_pi(self, i: int) -> int: if self.at_line_start() or self.intail or self.mdstack: @@ -270,53 +319,56 @@ def parse_element_content(self, element: etree.Element) -> None: md_attr = element.attrib.pop('markdown', 'off') if md_attr == 'block': - # Parse content as block level - # The order in which the different parts are parsed (text, children, tails) is important here as the - # order of elements needs to be preserved. We can't be inserting items at a later point in the current - # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for - # example). Therefore, the order of operations is children, tails, text. - - # Recursively parse existing children from raw HTML - for child in list(element): - self.parse_element_content(child) - - # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing. - # Save the position of each item to be inserted later in reverse. - tails = [] - for pos, child in enumerate(element): - if child.tail: - block = child.tail.rstrip('\n') - child.tail = '' - # Use a dummy placeholder element. 
- dummy = etree.Element('div') - self.parser.parseBlocks(dummy, block.split('\n\n')) - children = list(dummy) - children.reverse() - tails.append((pos + 1, children)) - - # Insert the elements created from the tails in reverse. - tails.reverse() - for pos, tail in tails: - for item in tail: - element.insert(pos, item) - - # Parse Markdown text content. Do this last to avoid raw HTML parsing. + # Parse the block elements content as Markdown if element.text: block = element.text.rstrip('\n') element.text = '' - # Use a dummy placeholder element as the content needs to get inserted before existing children. - dummy = etree.Element('div') - self.parser.parseBlocks(dummy, block.split('\n\n')) - children = list(dummy) - children.reverse() - for child in children: - element.insert(0, child) + self.parser.parseBlocks(element, block.split('\n\n')) elif md_attr == 'span': - # Span level parsing will be handled by inline processors. - # Walk children here to remove any `markdown` attributes. - for child in list(element): - self.parse_element_content(child) + # Span elements need to be recursively processed for block elements and raw HTML + # as their content is not normally accessed by block processors, so expand stashed + # HTML under the span. Span content itself will not be parsed here, but will await + # the inline parser. + block = element.text if element.text is not None else '' + element.text = '' + child = None + start = 0 + + # Search the content for HTML placeholders and process the elements + for m in util.HTML_PLACEHOLDER_RE.finditer(block): + index = int(m.group(1)) + el = self.parser.md.htmlStash.rawHtmlBlocks[index] + end = m.start() + + if isinstance(el, etree.Element): + # Replace the placeholder with the element and process it. + # Content after the placeholder should be attached to the tail. 
+ if child is None: + element.text += block[start:end] + else: + child.tail += block[start:end] + element.append(el) + self.parse_element_content(el) + child = el + if child.tail is None: + child.tail = '' + self.parser.md.htmlStash.rawHtmlBlocks.pop(index) + self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') + + else: + # Not an element object, so insert content back into the element + if child is None: + element.text += block[start:end] + else: + child.tail += block[start:end] + start = end + + # Insert anything left after last element + if child is None: + element.text += block[start:] + else: + child.tail += block[start:] else: # Disable inline parsing for everything else @@ -336,8 +388,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if isinstance(element, etree.Element): # We have a matched element. Process it. blocks.pop(0) - self.parse_element_content(element) parent.append(element) + self.parse_element_content(element) # Cleanup stash. Replace element with empty string to avoid confusing postprocessor. self.parser.md.htmlStash.rawHtmlBlocks.pop(index) self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py index 3d366ad9..13b3c35f 100644 --- a/markdown/inlinepatterns.py +++ b/markdown/inlinepatterns.py @@ -158,7 +158,13 @@ def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlinePro AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>' """ Match an automatic email link (``). """ -HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!).)*--)>)' +HTML_RE = ( + r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|' # Tag + r'!--(?:(?!).)*--|' # Comment + r'[?](?:(?!<[?]|[?]>).)*[?]|' # Processing instruction + r'!\[CDATA\[(?:(?!).)*\]\]' # `CDATA` + ')>)' +) """ Match an HTML tag (`<...>`). 
""" ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)' diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index c439a03b..1bdca393 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1206,6 +1206,317 @@ def test_md1_nested_footnote_ref(self): extensions=['md_in_html', 'footnotes'] ) + def test_md1_code_void_tag(self): + + # https://github.com/Python-Markdown/markdown/issues/1075 + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><input/></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_void_tag_multiline(self): + + # https://github.com/Python-Markdown/markdown/issues/1075 + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: ` + + ` + +
+ +
+ + HTML: + + +
+ """ + ), + '
\n' + '

Code: <label>\n' + '<input/>\n' + '</label>

\n' + '
\n' + '
\n' + '

HTML:\n' + '

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + '
*foo*
' + ), + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_mixed(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + """ +
+ +
+ *foo* +
+ +
+ """ + ), + '
\n' + '
\n' + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_tail(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + """ +
+ **foo** +
+ *bar* +
+ """ + ), + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
\n' + '
\n' + '

bar

\n' + '
\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_complex_start_tail(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + '
**foo**
' + '
*bar*
*not md*
', + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
\n' + '

bar

\n' + '
\n' + '
*not md*
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_complex_fail(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + # Nested will fail because an inline tag is only considered at the beginning if it is not preceded by text. + self.assertMarkdownRenders( + '
**strong**
**strong**
', + '
\n' + '

strong

strong

\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_start(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ """ + ), + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_span(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + '
*foo*
' + ), + '
\n' + '
foo
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_span_start(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ """ + ), + '
\n' + '
\n' + 'foo\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_span_block_start(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ *foo* +
+ """ + ), + '
\n' + '
\n' + 'foo\n' + '
\n\n' + 'foo
', + extensions=['md_in_html'] + ) + + def test_md1_code_comment(self): + + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><!-- **comment** --></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_pi(self): + + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><?php # echo \'**simple**\';?></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_cdata(self): + + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><![CDATA[some stuff]]></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + def load_tests(loader, tests, pattern): """ Ensure `TestHTMLBlocks` doesn't get run twice by excluding it here. """ diff --git a/tox.ini b/tox.ini index 31525091..768e76bf 100644 --- a/tox.ini +++ b/tox.ini @@ -30,7 +30,7 @@ extras = docs deps = pyspelling commands = {envpython} -m mkdocs build --config-file {toxinidir}/mkdocs.yml - {envpython} -m pyspelling --config {toxinidir}/.pyspelling.yml + {envpython} -m pyspelling -j 4 --config {toxinidir}/.pyspelling.yml [testenv:checklinks] extras = docs