From d24147792db8dc5a00690cbfe572d9ac12108861 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 23 Jan 2025 20:36:54 -0700 Subject: [PATCH 01/18] Adjust `md_in_html` "markdown" blocks to process content consistently Ensure `md_in_html` processes content inside a "markdown" block the same way content is processed outside of a "markdown" block. - Flatten the HTML content into placeholders so that the parser will treat the "markdown" block content in the same way it does when `md_in_html` is not enabled. The placeholders are expanded once the parser reaches them in a linear fashion. This allows extensions to deal with HTML content and consume it the same way it deals with them when the content is not nested under a "markdown" block. - Instead of content being processed in dummy tags, content is now processed under the real parent allowing extensions to have better context to make better decisions. --- markdown/extensions/md_in_html.py | 116 +++++++++++++++++++----------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index c162c9b3..d3e18509 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -163,6 +163,36 @@ def handle_endtag(self, tag): # If we only have one newline before block element, add another if not item.endswith('\n\n') and item.endswith('\n'): self.cleandoc.append('\n') + + # Flatten the HTML structure of "markdown" blocks such that when they + # get parsed, content will be parsed similar inside the blocks as it + # does outside the block. Having real HTML elements in the tree before + # the content adjacent content is processed can cause unpredictable + # issues for extensions. 
+ current = element + last = [] + while current is not None: + for child in list(current): + current.remove(child) + text = current.text if current.text is not None else '' + tail = child.tail if child.tail is not None else '' + child.tail = None + state = child.attrib.get('markdown', 'off') + + # If the tail is just a new line, omit it. + if tail == '\n': + tail = '' + + # Process the block nested under the spac appropriately + if state in ('span', 'block'): + current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail + last.append(child) + else: + child.attrib.pop('markdown') + [c.attrib.pop('markdown', None) for c in child.iter()] + current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail + current = last.pop(0) if last else None + self.cleandoc.append(self.md.htmlStash.store(element)) self.cleandoc.append('\n\n') self.state = [] @@ -270,53 +300,53 @@ def parse_element_content(self, element: etree.Element) -> None: md_attr = element.attrib.pop('markdown', 'off') if md_attr == 'block': - # Parse content as block level - # The order in which the different parts are parsed (text, children, tails) is important here as the - # order of elements needs to be preserved. We can't be inserting items at a later point in the current - # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for - # example). Therefore, the order of operations is children, tails, text. - - # Recursively parse existing children from raw HTML - for child in list(element): - self.parse_element_content(child) - - # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing. - # Save the position of each item to be inserted later in reverse. - tails = [] - for pos, child in enumerate(element): - if child.tail: - block = child.tail.rstrip('\n') - child.tail = '' - # Use a dummy placeholder element. 
- dummy = etree.Element('div') - self.parser.parseBlocks(dummy, block.split('\n\n')) - children = list(dummy) - children.reverse() - tails.append((pos + 1, children)) - - # Insert the elements created from the tails in reverse. - tails.reverse() - for pos, tail in tails: - for item in tail: - element.insert(pos, item) - - # Parse Markdown text content. Do this last to avoid raw HTML parsing. + # Parse the block elements content as Markdown if element.text: block = element.text.rstrip('\n') element.text = '' - # Use a dummy placeholder element as the content needs to get inserted before existing children. - dummy = etree.Element('div') - self.parser.parseBlocks(dummy, block.split('\n\n')) - children = list(dummy) - children.reverse() - for child in children: - element.insert(0, child) + self.parser.parseBlocks(element, block.split('\n\n')) elif md_attr == 'span': - # Span level parsing will be handled by inline processors. - # Walk children here to remove any `markdown` attributes. - for child in list(element): - self.parse_element_content(child) + # Span elements need to be recursively processed for block elements and raw HTML + # as their content is not normally accessed by block processors, so expand stashed + # HTML under the span. Span content itself will not be parsed here, but will await + # the inline parser. + block = element.text + element.text = '' + child = None + start = 0 + + # Search the content for HTML placeholders and process the elements + for m in util.HTML_PLACEHOLDER_RE.finditer(block): + index = int(m.group(1)) + el = self.parser.md.htmlStash.rawHtmlBlocks[index] + end = m.start() + + # Cut out the placeholder and and insert the processed element back in. 
+ if isinstance(el, etree.Element): + if child is None: + element.text = block[start:end] + else: + child.tail = (child.tail if child.tail is not None else '') + block[start:end] + element.append(el) + self.parse_element_content(el) + child = el + self.parser.md.htmlStash.rawHtmlBlocks.pop(index) + self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') + + else: + # Not an element object, so insert content back into the element + if child is None: + element.text = block[start:end] + else: + child.tail = (child.tail if child.tail is not None else '')+ block[start:end] + start = end + + # Insert anything left after last element + if child is None: + element.text = block[start:] + else: + child.tail = (child.tail if child.tail is not None else '') + block[start:] else: # Disable inline parsing for everything else @@ -336,8 +366,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if isinstance(element, etree.Element): # We have a matched element. Process it. blocks.pop(0) - self.parse_element_content(element) parent.append(element) + self.parse_element_content(element) # Cleanup stash. Replace element with empty string to avoid confusing postprocessor. 
self.parser.md.htmlStash.rawHtmlBlocks.pop(index) self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') From 606d3245a83e44281cb4aa7cbf21c49662f8f18c Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 23 Jan 2025 20:54:29 -0700 Subject: [PATCH 02/18] Fix lint and spelling errors --- markdown/extensions/md_in_html.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index d3e18509..480700eb 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -174,8 +174,8 @@ def handle_endtag(self, tag): while current is not None: for child in list(current): current.remove(child) - text = current.text if current.text is not None else '' - tail = child.tail if child.tail is not None else '' + text = current.text if current.text is not None else '' + tail = child.tail if child.tail is not None else '' child.tail = None state = child.attrib.get('markdown', 'off') @@ -183,7 +183,7 @@ def handle_endtag(self, tag): if tail == '\n': tail = '' - # Process the block nested under the spac appropriately + # Process the block nested under the span appropriately if state in ('span', 'block'): current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail last.append(child) @@ -339,7 +339,7 @@ def parse_element_content(self, element: etree.Element) -> None: if child is None: element.text = block[start:end] else: - child.tail = (child.tail if child.tail is not None else '')+ block[start:end] + child.tail = (child.tail if child.tail is not None else '') + block[start:end] start = end # Insert anything left after last element From 0702ec8343da7912665b1fa960119b7ff3555d37 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Fri, 24 Jan 2025 07:23:40 -0700 Subject: [PATCH 03/18] Slight clean up Use format strings and check for None text --- markdown/extensions/md_in_html.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git 
a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 480700eb..d0aa26fc 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -180,17 +180,16 @@ def handle_endtag(self, tag): state = child.attrib.get('markdown', 'off') # If the tail is just a new line, omit it. - if tail == '\n': - tail = '' + tail = '' if tail == '\n' else '\n' + tail # Process the block nested under the span appropriately if state in ('span', 'block'): - current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail + current.text = f'{text}\n{self.md.htmlStash.store(child)}{tail}' last.append(child) else: child.attrib.pop('markdown') [c.attrib.pop('markdown', None) for c in child.iter()] - current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail + current.text = f'{text}\n{self.md.htmlStash.store(child)}{tail}' current = last.pop(0) if last else None self.cleandoc.append(self.md.htmlStash.store(element)) @@ -311,7 +310,7 @@ def parse_element_content(self, element: etree.Element) -> None: # as their content is not normally accessed by block processors, so expand stashed # HTML under the span. Span content itself will not be parsed here, but will await # the inline parser. 
- block = element.text + block = element.text if element.text is not None else '' element.text = '' child = None start = 0 @@ -327,7 +326,7 @@ def parse_element_content(self, element: etree.Element) -> None: if child is None: element.text = block[start:end] else: - child.tail = (child.tail if child.tail is not None else '') + block[start:end] + child.tail = f"{child.tail if child.tail is not None else ''}{block[start:end]}" element.append(el) self.parse_element_content(el) child = el @@ -339,7 +338,7 @@ def parse_element_content(self, element: etree.Element) -> None: if child is None: element.text = block[start:end] else: - child.tail = (child.tail if child.tail is not None else '') + block[start:end] + child.tail = f"{child.tail if child.tail is not None else ''}{block[start:end]}" start = end # Insert anything left after last element From c7f59c9830b3a81c0b28264778db1e811454678e Mon Sep 17 00:00:00 2001 From: facelessuser Date: Fri, 24 Jan 2025 07:27:22 -0700 Subject: [PATCH 04/18] Fix tail newline handling --- markdown/extensions/md_in_html.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index d0aa26fc..8dcd96c2 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -179,8 +179,9 @@ def handle_endtag(self, tag): child.tail = None state = child.attrib.get('markdown', 'off') - # If the tail is just a new line, omit it. 
- tail = '' if tail == '\n' else '\n' + tail + # Add a newline to tail if it is not just a trailing newline + if tail != '\n': + tail = '\n' + tail # Process the block nested under the span appropriately if state in ('span', 'block'): From 6ef7de4b750201a692d9941b144f6cc3e884a65a Mon Sep 17 00:00:00 2001 From: facelessuser Date: Fri, 24 Jan 2025 07:33:30 -0700 Subject: [PATCH 05/18] Adjust some comments for clarity --- .spell-dict | 2 +- markdown/extensions/md_in_html.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.spell-dict b/.spell-dict index 2fc515e1..51a6a327 100644 --- a/.spell-dict +++ b/.spell-dict @@ -178,7 +178,7 @@ plugins configs pre formatters - +unflattened dedented Setext unindented diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 8dcd96c2..6f60a661 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -188,9 +188,14 @@ def handle_endtag(self, tag): current.text = f'{text}\n{self.md.htmlStash.store(child)}{tail}' last.append(child) else: + # Non-Markdown HTML will not be recursively parsed for Markdown, + # so we can just remove markers and leave them unflattened. + # Additionally, we don't need to append to our list for further + # processing. child.attrib.pop('markdown') [c.attrib.pop('markdown', None) for c in child.iter()] current.text = f'{text}\n{self.md.htmlStash.store(child)}{tail}' + # Target the child elements that have been expanded. current = last.pop(0) if last else None self.cleandoc.append(self.md.htmlStash.store(element)) @@ -322,8 +327,9 @@ def parse_element_content(self, element: etree.Element) -> None: el = self.parser.md.htmlStash.rawHtmlBlocks[index] end = m.start() - # Cut out the placeholder and and insert the processed element back in. if isinstance(el, etree.Element): + # Replace the placeholder with the element and process it. + # Content after the placeholder should be attached to the tail. 
if child is None: element.text = block[start:end] else: From 8a0ecf35be8dce498804be13ca2d8cde1f21715f Mon Sep 17 00:00:00 2001 From: facelessuser Date: Fri, 24 Jan 2025 08:49:40 -0700 Subject: [PATCH 06/18] Update changelog --- docs/changelog.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index 5ca092b2..aac97fe3 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed * Backslash Unescape IDs set via `attr_list` on `toc` (#1493). +* `md_in_html` should now process content inside "markdown" blocks a similar way + as they are parsed outside of "markdown" blocks giving a more consistent + expectation to external extensions (#1503). ## [3.7] -- 2024-08-16 From df26e8966abf83dd68c465280e97878ab35f5745 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 01:15:30 -0700 Subject: [PATCH 07/18] Make sure we aren't dropping any span content --- markdown/extensions/md_in_html.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 6f60a661..5f02c4e9 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -331,28 +331,30 @@ def parse_element_content(self, element: etree.Element) -> None: # Replace the placeholder with the element and process it. # Content after the placeholder should be attached to the tail. 
if child is None: - element.text = block[start:end] + element.text += block[start:end] else: - child.tail = f"{child.tail if child.tail is not None else ''}{block[start:end]}" + child.tail += block[start:end] element.append(el) self.parse_element_content(el) child = el + if child.tail is None: + child.tail = '' self.parser.md.htmlStash.rawHtmlBlocks.pop(index) self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') else: # Not an element object, so insert content back into the element if child is None: - element.text = block[start:end] + element.text += block[start:end] else: - child.tail = f"{child.tail if child.tail is not None else ''}{block[start:end]}" + child.tail += block[start:end] start = end # Insert anything left after last element if child is None: - element.text = block[start:] + element.text += block[start:] else: - child.tail = (child.tail if child.tail is not None else '') + block[start:] + child.tail += block[start:] else: # Disable inline parsing for everything else From 7759343cfc0db7a7eb2599af8a383a70d7fee257 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 01:17:21 -0700 Subject: [PATCH 08/18] Adjust some white space formatting --- markdown/extensions/md_in_html.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 5f02c4e9..b8ac4dd9 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -181,11 +181,15 @@ def handle_endtag(self, tag): # Add a newline to tail if it is not just a trailing newline if tail != '\n': - tail = '\n' + tail + tail = '\n' + tail.rstrip('\n') + + # Ensure there is an empty new line between blocks + if not text.endswith('\n\n'): + text = text.rstrip('\n') + '\n\n' # Process the block nested under the span appropriately if state in ('span', 'block'): - current.text = f'{text}\n{self.md.htmlStash.store(child)}{tail}' + current.text = 
f'{text}{self.md.htmlStash.store(child)}{tail}' last.append(child) else: # Non-Markdown HTML will not be recursively parsed for Markdown, @@ -194,7 +198,7 @@ def handle_endtag(self, tag): # processing. child.attrib.pop('markdown') [c.attrib.pop('markdown', None) for c in child.iter()] - current.text = f'{text}\n{self.md.htmlStash.store(child)}{tail}' + current.text = f'{text}{self.md.htmlStash.store(child)}{tail}' # Target the child elements that have been expanded. current = last.pop(0) if last else None From 0cc9aa4faada875444fc986bd37f2add6dc5987d Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 08:07:44 -0700 Subject: [PATCH 09/18] Better avoidance of tags in inline code --- markdown/extensions/md_in_html.py | 9 ++++++--- markdown/inlinepatterns.py | 8 +++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index b8ac4dd9..6359f4a6 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -253,10 +253,13 @@ def handle_empty_tag(self, data, is_block): if self.inraw or not self.mdstack: super().handle_empty_tag(data, is_block) else: - if self.at_line_start() and is_block: - self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') + if self.at_line_start() or self.intail: + if is_block: + self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') + else: + self.handle_data(self.md.htmlStash.store(data)) else: - self.handle_data(self.md.htmlStash.store(data)) + self.treebuilder.data(data) def parse_pi(self, i: int) -> int: if self.at_line_start() or self.intail or self.mdstack: diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py index 3d366ad9..23f517ba 100644 --- a/markdown/inlinepatterns.py +++ b/markdown/inlinepatterns.py @@ -158,7 +158,13 @@ def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlinePro AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>' """ Match an automatic email link (``). 
""" -HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!).)*--)>)' +HTML_RE = ( + r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|' # Tag + r'!--(?:(?!).)*--|' # Comment + r'[?](?:(?!<[?]|[?]>).)*[?]|' # Processing instruction + r')' +) """ Match an HTML tag (`<...>`). """ ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)' From 494f3217e33e3e4a1e30010a90a4d2eda461ca06 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 09:53:47 -0700 Subject: [PATCH 10/18] Add tests for inline code Fix CDATA pattern --- markdown/inlinepatterns.py | 8 +- .../test_syntax/extensions/test_md_in_html.py | 109 ++++++++++++++++++ 2 files changed, 113 insertions(+), 4 deletions(-) diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py index 23f517ba..13b3c35f 100644 --- a/markdown/inlinepatterns.py +++ b/markdown/inlinepatterns.py @@ -159,10 +159,10 @@ def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlinePro """ Match an automatic email link (``). """ HTML_RE = ( - r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|' # Tag - r'!--(?:(?!).)*--|' # Comment - r'[?](?:(?!<[?]|[?]>).)*[?]|' # Processing instruction - r'@ ]*( [^<>]*)?|' # Tag + r'!--(?:(?!).)*--|' # Comment + r'[?](?:(?!<[?]|[?]>).)*[?]|' # Processing instruction + r'!\[CDATA\[(?:(?!).)*\]\]' # `CDATA` ')>)' ) """ Match an HTML tag (`<...>`). """ diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index c439a03b..d37a39ed 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1206,6 +1206,115 @@ def test_md1_nested_footnote_ref(self): extensions=['md_in_html', 'footnotes'] ) + def test_md1_code_void_tag(self): + + # https://github.com/Python-Markdown/markdown/issues/1075 + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label></input></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_comment(self): + + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><!-- **comment** --></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_pi(self): + + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><?php # echo \'**simple**\';?></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_cdata(self): + + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: `` + +
+ +
+ + HTML: + +
+ """ + ), + '
\n' + '

Code: <label><![CDATA[some stuff]]></label>

\n' + '
\n' + '
\n' + '

HTML:

\n' + '
', + extensions=['md_in_html'] + ) + def load_tests(loader, tests, pattern): """ Ensure `TestHTMLBlocks` doesn't get run twice by excluding it here. """ From 724de8cd7ed9600e316b73d1e9d0063c6a788c9a Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 10:00:45 -0700 Subject: [PATCH 11/18] Update changelog in relation to fixing #1075 --- docs/changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changelog.md b/docs/changelog.md index aac97fe3..25d51698 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * `md_in_html` should now process content inside "markdown" blocks a similar way as they are parsed outside of "markdown" blocks giving a more consistent expectation to external extensions (#1503). +* `md_in_html` should not handle tags within inline code blocks better (#1075). ## [3.7] -- 2024-08-16 From 0c9a4593f8fcfd082b0382285e56ea9b52826a80 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 10:57:18 -0700 Subject: [PATCH 12/18] Handle HTML in multi-line inline code --- markdown/extensions/md_in_html.py | 11 ++--- .../test_syntax/extensions/test_md_in_html.py | 49 +++++++++++++++++-- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 6359f4a6..ed3e6711 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -253,13 +253,12 @@ def handle_empty_tag(self, data, is_block): if self.inraw or not self.mdstack: super().handle_empty_tag(data, is_block) else: - if self.at_line_start() or self.intail: - if is_block: - self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') - else: - self.handle_data(self.md.htmlStash.store(data)) + if self.at_line_start() and is_block: + self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') + elif self.mdstate and self.mdstate[-1] == "off": + 
self.handle_data(self.md.htmlStash.store(data)) else: - self.treebuilder.data(data) + self.handle_data(data) def parse_pi(self, i: int) -> int: if self.at_line_start() or self.intail or self.mdstack: diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index d37a39ed..061201b6 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1214,26 +1214,67 @@ def test_md1_code_void_tag(self): """
- Code: `` + Code: ``
- HTML: + HTML:
""" ), '
\n' - '

Code: <label></input></label>

\n' + '

Code: <label><input/></label>

\n' '
\n' '
\n' - '

HTML:

\n' + '

HTML:

\n' '
', extensions=['md_in_html'] ) + def test_md1_code_void_tag_multiline(self): + + # https://github.com/Python-Markdown/markdown/issues/1075 + self.assertMarkdownRenders( + self.dedent( + """ +
+ + Code: ` + + ` + +
+ +
+ + HTML: + + +
+ """ + ), + '
\n' + '

Code: <label>\n' + '<input/>\n' + '</label>

\n' + '
\n' + '
\n' + '

HTML:\n' + '

\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_code_comment(self): self.assertMarkdownRenders( From 08a4137b5e1fea01b07d6872f6e3a69f3512938c Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 12:13:51 -0700 Subject: [PATCH 13/18] Better handling of one-liner Block HTML Fixes #1074 --- docs/changelog.md | 5 +- markdown/extensions/md_in_html.py | 20 ++++ .../test_syntax/extensions/test_md_in_html.py | 101 ++++++++++++++++++ 3 files changed, 124 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 25d51698..204164f0 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -17,10 +17,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed * Backslash Unescape IDs set via `attr_list` on `toc` (#1493). -* `md_in_html` should now process content inside "markdown" blocks a similar way +* `md_in_html` will process content inside "markdown" blocks a similar way as they are parsed outside of "markdown" blocks giving a more consistent expectation to external extensions (#1503). -* `md_in_html` should not handle tags within inline code blocks better (#1075). +* `md_in_html` handle tags within inline code blocks better (#1075). 
+* `md_in_html` fix handling of one-liner block HTML handling (#1074) ## [3.7] -- 2024-08-16 diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index ed3e6711..9a2b3e43 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -61,6 +61,7 @@ def reset(self): self.mdstack: list[str] = [] # When markdown=1, stack contains a list of tags self.treebuilder = etree.TreeBuilder() self.mdstate: list[Literal['block', 'span', 'off', None]] = [] + self.mdstarted: list[bool] = [] super().reset() def close(self): @@ -126,8 +127,24 @@ def handle_starttag(self, tag, attrs): self.handle_endtag('p') self.mdstate.append(state) self.mdstack.append(tag) + self.mdstarted.append(True) attrs['markdown'] = state self.treebuilder.start(tag, attrs) + + elif not self.inraw and tag in self.block_level_tags and self.mdstarted and self.mdstarted[-1]: + # Nested one-liner block tags `...` + attrs = {key: value if value is not None else key for key, value in attrs} + state = self.get_state(tag, attrs) + if 'p' in self.mdstack and tag in self.block_level_tags: + # Close unclosed 'p' tag + self.handle_endtag('p') + self.mdstate.append(state) + self.mdstack.append(tag) + self.mdstarted.append(True) + attrs['markdown'] = state + self.treebuilder.start(tag, attrs) + return + else: # Span level tag if self.inraw: @@ -151,6 +168,7 @@ def handle_endtag(self, tag): while self.mdstack: item = self.mdstack.pop() self.mdstate.pop() + self.mdstarted.pop() self.treebuilder.end(item) if item == tag: break @@ -247,6 +265,8 @@ def handle_data(self, data): if self.inraw or not self.mdstack: super().handle_data(data) else: + for i in range(len(self.mdstarted)): + self.mdstarted[i] = False self.treebuilder.data(data) def handle_empty_tag(self, data, is_block): diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 061201b6..a443b1e9 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py 
+++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1274,6 +1274,107 @@ def test_md1_code_void_tag_multiline(self): extensions=['md_in_html'] ) + def test_md1_oneliner_block(self): + self.assertMarkdownRenders( + self.dedent( + '
*foo*
' + ), + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_mixed(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +
+ *foo* +
+ +
+ """ + ), + '
\n' + '
\n' + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_start(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ """ + ), + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_span(self): + self.assertMarkdownRenders( + self.dedent( + '
*foo*
' + ), + '
\n' + '
foo
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_span_start(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ """ + ), + '
\n' + '
\n' + 'foo\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_span_block_start(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ *foo* +
+ """ + ), + '
\n' + '
\n' + 'foo\n' + '
\n\n' + 'foo
', + extensions=['md_in_html'] + ) def test_md1_code_comment(self): From 008b8e0474a6add6da74f939bffa2760cc21926b Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 12:14:57 -0700 Subject: [PATCH 14/18] Speed up spellcheck by using 4 threads --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 31525091..768e76bf 100644 --- a/tox.ini +++ b/tox.ini @@ -30,7 +30,7 @@ extras = docs deps = pyspelling commands = {envpython} -m mkdocs build --config-file {toxinidir}/mkdocs.yml - {envpython} -m pyspelling --config {toxinidir}/.pyspelling.yml + {envpython} -m pyspelling -j 4 --config {toxinidir}/.pyspelling.yml [testenv:checklinks] extras = docs From a5cef11438c3cebe2ca27e8a7d1e58ad2c923e64 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 12:24:45 -0700 Subject: [PATCH 15/18] Link tests to related issue --- tests/test_syntax/extensions/test_md_in_html.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index a443b1e9..dfc34239 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1275,6 +1275,7 @@ def test_md1_code_void_tag_multiline(self): ) def test_md1_oneliner_block(self): + # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( self.dedent( '
*foo*
' @@ -1288,6 +1289,7 @@ def test_md1_oneliner_block(self): ) def test_md1_oneliner_block_mixed(self): + # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( self.dedent( """ @@ -1313,6 +1315,7 @@ def test_md1_oneliner_block_mixed(self): ) def test_md1_oneliner_block_start(self): + # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( self.dedent( """ @@ -1330,6 +1333,7 @@ def test_md1_oneliner_block_start(self): ) def test_md1_oneliner_block_span(self): + # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( self.dedent( '
*foo*
' @@ -1341,6 +1345,7 @@ def test_md1_oneliner_block_span(self): ) def test_md1_oneliner_block_span_start(self): + # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( self.dedent( """ @@ -1358,6 +1363,7 @@ def test_md1_oneliner_block_span_start(self): ) def test_md1_oneliner_span_block_start(self): + # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( self.dedent( """ From dbddf88567e5003bc2638814940e5b98a013feb0 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 14:38:51 -0700 Subject: [PATCH 16/18] Ensure we capture continued tail cases --- markdown/extensions/md_in_html.py | 3 +- .../test_syntax/extensions/test_md_in_html.py | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 9a2b3e43..4b501001 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -265,8 +265,7 @@ def handle_data(self, data): if self.inraw or not self.mdstack: super().handle_data(data) else: - for i in range(len(self.mdstarted)): - self.mdstarted[i] = False + self.mdstarted[-1] = False self.treebuilder.data(data) def handle_empty_tag(self, data, is_block): diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index dfc34239..400e2f9f 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1314,6 +1314,48 @@ def test_md1_oneliner_block_mixed(self): extensions=['md_in_html'] ) + def test_md1_oneliner_block_tail(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + self.dedent( + """ +
+ **foo** +
+ *bar* +
+ """ + ), + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
\n' + '
\n' + '

bar

\n' + '
\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + + def test_md1_oneliner_block_complex_start_tail(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + self.assertMarkdownRenders( + '
**foo**
' + '
*bar*
*not md*
', + '
\n' + '
\n' + '

foo

\n' + '
\n' + '
\n' + '

bar

\n' + '
\n' + '
*not md*
\n' + '
', + extensions=['md_in_html'] + ) + def test_md1_oneliner_block_start(self): # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders( From 4887c9e1056b9a9659edf5df4469bd477d32832d Mon Sep 17 00:00:00 2001 From: facelessuser Date: Sat, 25 Jan 2025 14:56:11 -0700 Subject: [PATCH 17/18] Consolidate logic --- markdown/extensions/md_in_html.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 4b501001..d1fbd7af 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -112,7 +112,10 @@ def handle_starttag(self, tag, attrs): self.handle_empty_tag(data, True) return - if tag in self.block_level_tags and (self.at_line_start() or self.intail): + if ( + tag in self.block_level_tags and + (self.at_line_start() or self.intail or self.mdstarted and self.mdstarted[-1]) + ): # Valueless attribute (ex: ``) results in `[('checked', None)]`. # Convert to `{'checked': 'checked'}`. 
attrs = {key: value if value is not None else key for key, value in attrs} @@ -131,20 +134,6 @@ def handle_starttag(self, tag, attrs): attrs['markdown'] = state self.treebuilder.start(tag, attrs) - elif not self.inraw and tag in self.block_level_tags and self.mdstarted and self.mdstarted[-1]: - # Nested one-liner block tags `...` - attrs = {key: value if value is not None else key for key, value in attrs} - state = self.get_state(tag, attrs) - if 'p' in self.mdstack and tag in self.block_level_tags: - # Close unclosed 'p' tag - self.handle_endtag('p') - self.mdstate.append(state) - self.mdstack.append(tag) - self.mdstarted.append(True) - attrs['markdown'] = state - self.treebuilder.start(tag, attrs) - return - else: # Span level tag if self.inraw: From c0cd3924b68e189b501e9d51b706de073a8aedd1 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Tue, 28 Jan 2025 07:23:12 -0700 Subject: [PATCH 18/18] Test to confirm tags following content on one line are not recognized --- tests/test_syntax/extensions/test_md_in_html.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 400e2f9f..1bdca393 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -1356,6 +1356,18 @@ def test_md1_oneliner_block_complex_start_tail(self): extensions=['md_in_html'] ) + def test_md1_oneliner_block_complex_fail(self): + # https://github.com/Python-Markdown/markdown/issues/1074 + # Nested will fail because an inline tag is only considered at the beginning if it is not preceded by text. + self.assertMarkdownRenders( + '
**strong**
**strong**
', + '
\n' + '

strong

strong

\n' + '
\n' + '
', + extensions=['md_in_html'] + ) + def test_md1_oneliner_block_start(self): # https://github.com/Python-Markdown/markdown/issues/1074 self.assertMarkdownRenders(