nvaccess · seanbudd · May 6, 2025 · Apr 22, 2025 · Apr 23, 2025 · May 2, 2025
@@ -119,61 +119,96 @@ def _isEqual(self, other):
 			return self.accID == other.accID
 		return super(AcrobatNode, self)._isEqual(other)
 
-	def _getNodeMathMl(self, node):
+	@staticmethod
+	def getMathMLAttributes(node: IPDDomElement, attrList: list) -> str:
+		"""Get the MathML attributes in 'attrList' for a 'node' (MathML element)."""
+		attrValues = ""
+		for attr in attrList:
+			# "NSO" comes from the PDF spec
+			val = node.GetAttribute(attr, "NSO")
+			if val:
+				attrValues += f' {attr}="{val}"'
+		return attrValues
+
+	def _getNodeMathMl(self, node: IPDDomElement) -> str:
+		"""Traverse the MathML tree and return an XML string representing the math"""
+
 		tag = node.GetTagName()
-		yield "<%s" % tag
-		# Output relevant attributes.
-		if tag == "mfenced":
-			for attr in "open", "close", "separators":
-				val = node.GetAttribute(attr, "XML-1.00")
-				if val:
-					yield ' %s="%s"' % (attr, val)
-		yield ">"
+		answer = f"<{tag}"
+		# Output relevant attributes
+		id = node.GetID()
+		if id:
+			answer += f' id="{id}"'
+		# The PDF interface lacks a way to get all the attributes, so we have to get specific ones
+		# The attributes below affect accessibility
+		answer += AcrobatNode.getMathMLAttributes(node, ["intent", "arg"])
+		match tag:
+			case "mi" | "mn" | "mo" | "mtext":
+				answer += AcrobatNode.getMathMLAttributes(node, ["mathvariant"])
+			case "mfenced":
+				answer += AcrobatNode.getMathMLAttributes(node, ["open", "close", "separators"])
+			case "menclose":
+				answer += AcrobatNode.getMathMLAttributes(node, ["notation", "notationtype"])
+			case "annotation-xml" | "annotation":
+				answer += AcrobatNode.getMathMLAttributes(node, ["encoding"])
+			case "ms":
+				answer += AcrobatNode.getMathMLAttributes(node, ["open", "close"])
+			case _:
+				pass
+		answer += ">"
 		val = node.GetValue()
 		if val:
-			yield val
+			answer += val
 		else:
 			for childNum in range(node.GetChildCount()):
 				try:
 					subNode = node.GetChild(childNum).QueryInterface(IPDDomElement)
 				except COMError:
 					continue
 				for sub in self._getNodeMathMl(subNode):
-					yield sub
-		yield "</%s>" % tag
+					answer += sub
+		return answer + f"</{tag}>"
 
 	def _get_mathMl(self) -> str:
 		"""Return the MathML associated with a Formula tag"""
+		# There are two ways that MathML can be represented in a PDF:
+		# 1. As a series of nested tags, each with a MathML element as the value.
+		# 2. As a Formula tag with MathML as the value (comes from MathML in an Associated File)
 		if self.pdDomNode is None:
 			log.debugWarning("_get_mathMl: self.pdDomNode is None!")
 			raise LookupError
+
+		# see if it is MathML tagging is used
+		for childNum in range(self.pdDomNode.GetChildCount()):
+			try:
+				child = self.pdDomNode.GetChild(childNum).QueryInterface(IPDDomElement)
+			except COMError:
+				log.debugWarning(f"COMError trying to get {childNum=}")
+				continue
+			if log.isEnabledFor(log.DEBUG):
+				log.debug(f"\t(PDF) get_mathMl: tag={child.GetTagName()}")
+			if child.GetTagName() == "math":
+				answer = "".join(self._getNodeMathMl(child))
+				log.debug(f"_get_mathMl (PDF): found tagged MathML = {answer}")
+				return answer
+
 		mathMl = self.pdDomNode.GetValue()
 		if log.isEnabledFor(log.DEBUG):
 			log.debug(
 				(
-					f"_get_mathMl: math recognized: {mathMl.startswith('<math')}, "
+					f"_get_mathMl (PDF): math recognized: {mathMl.startswith('<math')}, "
 					f"child count={self.pdDomNode.GetChildCount()},"
-					f"\n  name='{self.pdDomNode.GetName()}', value='{mathMl}'"
+					f"\n  name='{self.pdDomNode.GetName()}', value found from AF ='{mathMl}'"
 				),
 			)
 		# this test and the replacement doesn't work if someone uses a namespace tag (which they shouldn't, but..)
 		if mathMl.startswith("<math"):
 			return mathMl.replace('xmlns:mml="http://www.w3.org/1998/Math/MathML"', "")
-		# Alternative for tagging: all the sub expressions are tagged -- gather up the MathML
-		for childNum in range(self.pdDomNode.GetChildCount()):
-			try:
-				child = self.pdDomNode.GetChild(childNum).QueryInterface(IPDDomElement)
-			except COMError:
-				log.debugWarning(f"COMError trying to get childNum={childNum}")
-				continue
-			if log.isEnabledFor(log.DEBUG):
-				log.debug(f"\tget_mathMl: tag={child.GetTagName()}")
-			if child.GetTagName() == "math":
-				return "".join(self._getNodeMathMl(child))
-		# fall back to return the contents, which is hopefully alt text
-		if log.isEnabledFor(log.DEBUG):
-			log.debug("_get_mathMl: didn't find MathML -- returning value as mtext")
-		return f"<math><mtext>{self.pdDomNode.GetValue()}</mtext></math>"
+
+		# not MathML -- fall back to return the contents, which is hopefully alt text, inside an <mtext>
+		answer = f"<math><mtext>{mathMl}</mtext></math>"
+		log.debug(f"_get_mathMl: didn't find MathML -- returning value as mtext: {answer}")
+		return answer
 
 
 class RootNode(AcrobatNode):

@@ -598,6 +598,7 @@ There are many minor bug fixes for applications, such as Thunderbird, Adobe Read
 
 ### Bug Fixes
 
+* Fixed math attributes being read in Adobe Reader, this resulted in poor or wrong speech and braille. (#17980)
 * Windows 11 fixes:
   * NVDA will once again announce hardware keyboard input suggestions. (#16283, @josephsl)
   * In Version 24H2 (2024 Update and Windows Server 2025), mouse and touch interaction can be used in quick settings. (#16348, @josephsl)