Skip to content

CLN/DOC: Adjust xpath validation and error messaging in read_xml with IO tools doc note and example #48386

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
However, if XPath does not reference node names such as default, ``/*``, then
``namespaces`` is not required.

.. note::

Since ``xpath`` identifies the parent of content to be parsed, only immediate
descendants which include child nodes or current attributes are parsed.
Therefore, ``read_xml`` will not parse the text of grandchildren or other
descendants and will not parse attributes of any descendant. To retrieve
lower level content, adjust xpath to lower level. For example,

.. ipython:: python
:okwarning:

xml = """
<data>
<row>
<shape sides="4">square</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="0">circle</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="3">triangle</shape>
<degrees>180</degrees>
</row>
</data>"""

df = pd.read_xml(xml, xpath="./row")
df

shows that the attribute ``sides`` on the ``shape`` element was not parsed. This is
expected, since the attribute resides on a child of the ``row`` element
and not on the ``row`` element itself. In other words, the ``sides`` attribute is a
grandchild-level descendant of the ``row`` element. However, the ``xpath``
targets the ``row`` element, which covers only its children and attributes.

With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
script which also can be string/file/URL types. As background, `XSLT`_ is
a special-purpose language written in a special XML file that can transform
Expand Down
52 changes: 35 additions & 17 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:

return dicts

def _validate_path(self) -> None:
def _validate_path(self) -> list[Any]:
"""
Validate xpath.

Expand Down Expand Up @@ -446,8 +446,7 @@ def parse_data(self) -> list[dict[str, str | None]]:

if self.iterparse is None:
self.xml_doc = self._parse_doc(self.path_or_buffer)
self._validate_path()
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
elems = self._validate_path()

self._validate_names()

Expand All @@ -459,7 +458,7 @@ def parse_data(self) -> list[dict[str, str | None]]:

return xml_dicts

def _validate_path(self) -> None:
def _validate_path(self) -> list[Any]:
"""
Notes
-----
Expand All @@ -468,18 +467,28 @@ def _validate_path(self) -> None:
"""

msg = (
"xpath does not return any nodes. "
"xpath does not return any nodes or attributes. "
"Be sure to specify in `xpath` the parent nodes of "
"children and attributes to parse. "
"If document uses namespaces denoted with "
"xmlns, be sure to define namespaces and "
"use them in xpath."
)
try:
elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
children = [ch for el in elems for ch in el.findall("*")]
attrs = {k: v for el in elems for k, v in el.attrib.items()}

if elems is None:
raise ValueError(msg)

if elems is not None and elems.find("*") is None and elems.attrib is None:
raise ValueError(msg)
if elems is not None:
if self.elems_only and children == []:
raise ValueError(msg)
elif self.attrs_only and attrs == {}:
raise ValueError(msg)
elif children == [] and attrs == {}:
raise ValueError(msg)

except (KeyError, SyntaxError):
raise SyntaxError(
Expand All @@ -488,6 +497,8 @@ def _validate_path(self) -> None:
"undeclared namespace prefix."
)

return elems

def _validate_names(self) -> None:
children: list[Any]

Expand Down Expand Up @@ -554,8 +565,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
self.xsl_doc = self._parse_doc(self.stylesheet)
self.xml_doc = self._transform_doc()

self._validate_path()
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
elems = self._validate_path()

self._validate_names()

Expand All @@ -567,25 +577,33 @@ def parse_data(self) -> list[dict[str, str | None]]:

return xml_dicts

def _validate_path(self) -> None:
def _validate_path(self) -> list[Any]:

msg = (
"xpath does not return any nodes. "
"Be sure row level nodes are in xpath. "
"xpath does not return any nodes or attributes. "
"Be sure to specify in `xpath` the parent nodes of "
"children and attributes to parse. "
"If document uses namespaces denoted with "
"xmlns, be sure to define namespaces and "
"use them in xpath."
)

elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)
children = [ch for el in elems for ch in el.xpath("*")]
attrs = {k: v for el in elems for k, v in el.attrib.items()}

if elems == []:
raise ValueError(msg)

if elems != [] and attrs == [] and children == []:
raise ValueError(msg)
if elems != []:
if self.elems_only and children == []:
raise ValueError(msg)
elif self.attrs_only and attrs == {}:
raise ValueError(msg)
elif children == [] and attrs == {}:
raise ValueError(msg)

return elems

def _validate_names(self) -> None:
children: list[Any]
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,45 @@ def test_elem_and_attrs_only(datapath, parser):
read_xml(filename, elems_only=True, attrs_only=True, parser=parser)


def test_empty_attrs_only(parser):
    """
    ``attrs_only=True`` must raise when the matched rows carry no attributes.

    The ``row`` elements selected by ``./row`` have only child elements; the
    ``sides`` attribute lives on the ``shape`` child, so nothing is available
    to parse at the row level.
    """
    expected_msg = "xpath does not return any nodes or attributes"
    xml_doc = """
<data>
<row>
<shape sides="4">square</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="0">circle</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="3">triangle</shape>
<degrees>180</degrees>
</row>
</data>"""

    with pytest.raises(ValueError, match=expected_msg):
        read_xml(xml_doc, xpath="./row", attrs_only=True, parser=parser)


def test_empty_elems_only(parser):
    """
    ``elems_only=True`` must raise when the matched rows have no child elements.

    Every ``row`` selected by ``./row`` is attribute-only (self-closing), so
    there are no child nodes to parse.
    """
    expected_msg = "xpath does not return any nodes or attributes"
    xml_doc = """
<data>
<row sides="4" shape="square" degrees="360"/>
<row sides="0" shape="circle" degrees="360"/>
<row sides="3" shape="triangle" degrees="180"/>
</data>"""

    with pytest.raises(ValueError, match=expected_msg):
        read_xml(xml_doc, xpath="./row", elems_only=True, parser=parser)


@td.skip_if_no("lxml")
def test_attribute_centric_xml():
xml = """\
Expand Down