Skip to content

Adjust md_in_html "markdown" blocks to process content consistently #1503

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .spell-dict
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ plugins
configs
pre
formatters

unflattened
dedented
Setext
unindented
Expand Down
5 changes: 5 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

* Backslash Unescape IDs set via `attr_list` on `toc` (#1493).
* `md_in_html` will process content inside "markdown" blocks in a similar way
to how it is parsed outside of "markdown" blocks, giving external extensions
a more consistent expectation (#1503).
* `md_in_html` handles tags within inline code blocks better (#1075).
* `md_in_html` fixes handling of one-liner block HTML (#1074).

## [3.7] -- 2024-08-16

Expand Down
142 changes: 97 additions & 45 deletions markdown/extensions/md_in_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def reset(self):
self.mdstack: list[str] = [] # When markdown=1, stack contains a list of tags
self.treebuilder = etree.TreeBuilder()
self.mdstate: list[Literal['block', 'span', 'off', None]] = []
self.mdstarted: list[bool] = []
super().reset()

def close(self):
Expand Down Expand Up @@ -111,7 +112,10 @@ def handle_starttag(self, tag, attrs):
self.handle_empty_tag(data, True)
return

if tag in self.block_level_tags and (self.at_line_start() or self.intail):
if (
tag in self.block_level_tags and
(self.at_line_start() or self.intail or self.mdstarted and self.mdstarted[-1])
):
# Valueless attribute (ex: `<tag checked>`) results in `[('checked', None)]`.
# Convert to `{'checked': 'checked'}`.
attrs = {key: value if value is not None else key for key, value in attrs}
Expand All @@ -126,8 +130,10 @@ def handle_starttag(self, tag, attrs):
self.handle_endtag('p')
self.mdstate.append(state)
self.mdstack.append(tag)
self.mdstarted.append(True)
attrs['markdown'] = state
self.treebuilder.start(tag, attrs)

else:
# Span level tag
if self.inraw:
Expand All @@ -151,6 +157,7 @@ def handle_endtag(self, tag):
while self.mdstack:
item = self.mdstack.pop()
self.mdstate.pop()
self.mdstarted.pop()
self.treebuilder.end(item)
if item == tag:
break
Expand All @@ -163,6 +170,45 @@ def handle_endtag(self, tag):
# If we only have one newline before block element, add another
if not item.endswith('\n\n') and item.endswith('\n'):
self.cleandoc.append('\n')

# Flatten the HTML structure of "markdown" blocks such that when they
# get parsed, content will be parsed inside the blocks similarly to how
# it is parsed outside the blocks. Having real HTML elements in the tree
# before the adjacent content is processed can cause unpredictable
# issues for extensions.
current = element
last = []
while current is not None:
for child in list(current):
current.remove(child)
text = current.text if current.text is not None else ''
tail = child.tail if child.tail is not None else ''
child.tail = None
state = child.attrib.get('markdown', 'off')

# Add a newline to tail if it is not just a trailing newline
if tail != '\n':
tail = '\n' + tail.rstrip('\n')

# Ensure there is an empty new line between blocks
if not text.endswith('\n\n'):
text = text.rstrip('\n') + '\n\n'

# Process the block nested under the span appropriately
if state in ('span', 'block'):
current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
last.append(child)
else:
# Non-Markdown HTML will not be recursively parsed for Markdown,
# so we can just remove markers and leave them unflattened.
# Additionally, we don't need to append to our list for further
# processing.
child.attrib.pop('markdown')
[c.attrib.pop('markdown', None) for c in child.iter()]
current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
# Target the child elements that have been expanded.
current = last.pop(0) if last else None

self.cleandoc.append(self.md.htmlStash.store(element))
self.cleandoc.append('\n\n')
self.state = []
Expand Down Expand Up @@ -208,6 +254,7 @@ def handle_data(self, data):
if self.inraw or not self.mdstack:
super().handle_data(data)
else:
self.mdstarted[-1] = False
self.treebuilder.data(data)

def handle_empty_tag(self, data, is_block):
Expand All @@ -216,8 +263,10 @@ def handle_empty_tag(self, data, is_block):
else:
if self.at_line_start() and is_block:
self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
else:
elif self.mdstate and self.mdstate[-1] == "off":
self.handle_data(self.md.htmlStash.store(data))
else:
self.handle_data(data)

def parse_pi(self, i: int) -> int:
if self.at_line_start() or self.intail or self.mdstack:
Expand Down Expand Up @@ -270,53 +319,56 @@ def parse_element_content(self, element: etree.Element) -> None:
md_attr = element.attrib.pop('markdown', 'off')

if md_attr == 'block':
# Parse content as block level
# The order in which the different parts are parsed (text, children, tails) is important here as the
# order of elements needs to be preserved. We can't be inserting items at a later point in the current
# iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
# example). Therefore, the order of operations is children, tails, text.

# Recursively parse existing children from raw HTML
for child in list(element):
self.parse_element_content(child)

# Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
# Save the position of each item to be inserted later in reverse.
tails = []
for pos, child in enumerate(element):
if child.tail:
block = child.tail.rstrip('\n')
child.tail = ''
# Use a dummy placeholder element.
dummy = etree.Element('div')
self.parser.parseBlocks(dummy, block.split('\n\n'))
children = list(dummy)
children.reverse()
tails.append((pos + 1, children))

# Insert the elements created from the tails in reverse.
tails.reverse()
for pos, tail in tails:
for item in tail:
element.insert(pos, item)

# Parse Markdown text content. Do this last to avoid raw HTML parsing.
# Parse the block element's content as Markdown
if element.text:
block = element.text.rstrip('\n')
element.text = ''
# Use a dummy placeholder element as the content needs to get inserted before existing children.
dummy = etree.Element('div')
self.parser.parseBlocks(dummy, block.split('\n\n'))
children = list(dummy)
children.reverse()
for child in children:
element.insert(0, child)
self.parser.parseBlocks(element, block.split('\n\n'))

elif md_attr == 'span':
# Span level parsing will be handled by inline processors.
# Walk children here to remove any `markdown` attributes.
for child in list(element):
self.parse_element_content(child)
# Span elements need to be recursively processed for block elements and raw HTML
# as their content is not normally accessed by block processors, so expand stashed
# HTML under the span. Span content itself will not be parsed here, but will await
# the inline parser.
block = element.text if element.text is not None else ''
element.text = ''
child = None
start = 0

# Search the content for HTML placeholders and process the elements
for m in util.HTML_PLACEHOLDER_RE.finditer(block):
index = int(m.group(1))
el = self.parser.md.htmlStash.rawHtmlBlocks[index]
end = m.start()

if isinstance(el, etree.Element):
# Replace the placeholder with the element and process it.
# Content after the placeholder should be attached to the tail.
if child is None:
element.text += block[start:end]
else:
child.tail += block[start:end]
element.append(el)
self.parse_element_content(el)
child = el
if child.tail is None:
child.tail = ''
self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')

else:
# Not an element object, so insert content back into the element
if child is None:
element.text += block[start:end]
else:
child.tail += block[start:end]
start = end

# Insert anything left after last element
if child is None:
element.text += block[start:]
else:
child.tail += block[start:]

else:
# Disable inline parsing for everything else
Expand All @@ -336,8 +388,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
if isinstance(element, etree.Element):
# We have a matched element. Process it.
blocks.pop(0)
self.parse_element_content(element)
parent.append(element)
self.parse_element_content(element)
# Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
Expand Down
8 changes: 7 additions & 1 deletion markdown/inlinepatterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,13 @@ def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlinePro
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<[email protected]>`). """

HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
HTML_RE = (
r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|' # Tag
r'!--(?:(?!<!--|-->).)*--|' # Comment
r'[?](?:(?!<[?]|[?]>).)*[?]|' # Processing instruction
r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]' # `CDATA`
')>)'
)
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
Expand Down
Loading
Loading