Skip to content

Use MathML attributes for PDFs read in Adobe Acrobat #17984

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 64 additions & 29 deletions source/NVDAObjects/IAccessible/adobeAcrobat.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,61 +119,96 @@ def _isEqual(self, other):
return self.accID == other.accID
return super(AcrobatNode, self)._isEqual(other)

def _getNodeMathMl(self, node):
@staticmethod
def getMathMLAttributes(node: IPDDomElement, attrList: list) -> str:
"""Get the MathML attributes in 'attrList' for a 'node' (MathML element)."""
attrValues = ""
for attr in attrList:
# "NSO" comes from the PDF spec
val = node.GetAttribute(attr, "NSO")
if val:
attrValues += f' {attr}="{val}"'
return attrValues

def _getNodeMathMl(self, node: IPDDomElement) -> str:
"""Traverse the MathML tree and return an XML string representing the math"""

tag = node.GetTagName()
yield "<%s" % tag
# Output relevant attributes.
if tag == "mfenced":
for attr in "open", "close", "separators":
val = node.GetAttribute(attr, "XML-1.00")
if val:
yield ' %s="%s"' % (attr, val)
yield ">"
answer = f"<{tag}"
# Output relevant attributes
id = node.GetID()
if id:
answer += f' id="{id}"'
# The PDF interface lacks a way to get all the attributes, so we have to get specific ones
# The attributes below affect accessibility
answer += AcrobatNode.getMathMLAttributes(node, ["intent", "arg"])
match tag:
case "mi" | "mn" | "mo" | "mtext":
answer += AcrobatNode.getMathMLAttributes(node, ["mathvariant"])
case "mfenced":
answer += AcrobatNode.getMathMLAttributes(node, ["open", "close", "separators"])
case "menclose":
answer += AcrobatNode.getMathMLAttributes(node, ["notation", "notationtype"])
case "annotation-xml" | "annotation":
answer += AcrobatNode.getMathMLAttributes(node, ["encoding"])
case "ms":
answer += AcrobatNode.getMathMLAttributes(node, ["open", "close"])
case _:
pass
answer += ">"
val = node.GetValue()
if val:
yield val
answer += val
else:
for childNum in range(node.GetChildCount()):
try:
subNode = node.GetChild(childNum).QueryInterface(IPDDomElement)
except COMError:
continue
for sub in self._getNodeMathMl(subNode):
yield sub
yield "</%s>" % tag
answer += sub
return answer + f"</{tag}>"

def _get_mathMl(self) -> str:
"""Return the MathML associated with a Formula tag"""
# There are two ways that MathML can be represented in a PDF:
# 1. As a series of nested tags, each with a MathML element as the value.
# 2. As a Formula tag with MathML as the value (comes from MathML in an Associated File)
if self.pdDomNode is None:
log.debugWarning("_get_mathMl: self.pdDomNode is None!")
raise LookupError

# see if MathML tagging is used
for childNum in range(self.pdDomNode.GetChildCount()):
try:
child = self.pdDomNode.GetChild(childNum).QueryInterface(IPDDomElement)
except COMError:
log.debugWarning(f"COMError trying to get {childNum=}")
continue
if log.isEnabledFor(log.DEBUG):
log.debug(f"\t(PDF) get_mathMl: tag={child.GetTagName()}")
if child.GetTagName() == "math":
answer = "".join(self._getNodeMathMl(child))
log.debug(f"_get_mathMl (PDF): found tagged MathML = {answer}")
return answer

mathMl = self.pdDomNode.GetValue()
if log.isEnabledFor(log.DEBUG):
log.debug(
(
f"_get_mathMl: math recognized: {mathMl.startswith('<math')}, "
f"_get_mathMl (PDF): math recognized: {mathMl.startswith('<math')}, "
f"child count={self.pdDomNode.GetChildCount()},"
f"\n name='{self.pdDomNode.GetName()}', value='{mathMl}'"
f"\n name='{self.pdDomNode.GetName()}', value found from AF ='{mathMl}'"
),
)
# this test and the replacement don't work if someone uses a namespace prefix (which they shouldn't, but...)
if mathMl.startswith("<math"):
return mathMl.replace('xmlns:mml="http://www.w3.org/1998/Math/MathML"', "")
# Alternative for tagging: all the sub expressions are tagged -- gather up the MathML
for childNum in range(self.pdDomNode.GetChildCount()):
try:
child = self.pdDomNode.GetChild(childNum).QueryInterface(IPDDomElement)
except COMError:
log.debugWarning(f"COMError trying to get childNum={childNum}")
continue
if log.isEnabledFor(log.DEBUG):
log.debug(f"\tget_mathMl: tag={child.GetTagName()}")
if child.GetTagName() == "math":
return "".join(self._getNodeMathMl(child))
# fall back to return the contents, which is hopefully alt text
if log.isEnabledFor(log.DEBUG):
log.debug("_get_mathMl: didn't find MathML -- returning value as mtext")
return f"<math><mtext>{self.pdDomNode.GetValue()}</mtext></math>"

# not MathML -- fall back to return the contents, which is hopefully alt text, inside an <mtext>
answer = f"<math><mtext>{mathMl}</mtext></math>"
log.debug(f"_get_mathMl: didn't find MathML -- returning value as mtext: {answer}")
return answer


class RootNode(AcrobatNode):
Expand Down
1 change: 1 addition & 0 deletions user_docs/en/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,7 @@ There are many minor bug fixes for applications, such as Thunderbird, Adobe Read

### Bug Fixes

* Math attributes in PDFs are now read in Adobe Reader; previously, missing attributes resulted in poor or wrong speech and braille. (#17980)
* Windows 11 fixes:
* NVDA will once again announce hardware keyboard input suggestions. (#16283, @josephsl)
* In Version 24H2 (2024 Update and Windows Server 2025), mouse and touch interaction can be used in quick settings. (#16348, @josephsl)
Expand Down