pandas-dev · mroeschke · Sep 6, 2022 · Sep 4, 2022 · Sep 4, 2022 · Sep 4, 2022
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
 However, if XPath does not reference node names such as default, ``/*``, then
 ``namespaces`` is not required.
 
+.. note::
+
+   Since ``xpath`` identifies the parent of content to be parsed, only immediate
+   descendants which include child nodes or current attributes are parsed.
+   Therefore, ``read_xml`` will not parse the text of grandchildren or other
+   descendants and will not parse attributes of any descendant. To retrieve
+   lower level content, adjust ``xpath`` to a lower level. For example,
+
+   .. ipython:: python
+      :okwarning:
+
+      xml = """
+      <data>
+        <row>
+          <shape sides="4">square</shape>
+          <degrees>360</degrees>
+        </row>
+        <row>
+          <shape sides="0">circle</shape>
+          <degrees>360</degrees>
+        </row>
+        <row>
+          <shape sides="3">triangle</shape>
+          <degrees>180</degrees>
+        </row>
+      </data>"""
+
+      df = pd.read_xml(xml, xpath="./row")
+      df
+
+   shows that the attribute ``sides`` on the ``shape`` element was not parsed,
+   as expected, since this attribute resides on the child of the ``row``
+   element and not on the ``row`` element itself. In other words, ``sides`` is
+   a grandchild-level descendant of the ``row`` element, whereas the ``xpath``
+   targets the ``row`` element, which covers only its children and attributes.
+
 With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
 script which also can be string/file/URL types. As background, `XSLT`_ is
 a special-purpose language written in a special XML file that can transform

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -387,7 +387,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
 
         return dicts
 
-    def _validate_path(self) -> None:
+    def _validate_path(self) -> list[Any]:
         """
         Validate xpath.
 
@@ -446,8 +446,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
 
         if self.iterparse is None:
             self.xml_doc = self._parse_doc(self.path_or_buffer)
-            self._validate_path()
-            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+            elems = self._validate_path()
 
         self._validate_names()
 
@@ -459,7 +458,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
 
         return xml_dicts
 
-    def _validate_path(self) -> None:
+    def _validate_path(self) -> list[Any]:
         """
         Notes
         -----
@@ -468,18 +467,28 @@ def _validate_path(self) -> None:
         """
 
         msg = (
-            "xpath does not return any nodes. "
+            "xpath does not return any nodes or attributes. "
+            "Be sure to specify in `xpath` the parent nodes of "
+            "children and attributes to parse. "
             "If document uses namespaces denoted with "
             "xmlns, be sure to define namespaces and "
             "use them in xpath."
         )
         try:
-            elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
+            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+            children = [ch for el in elems for ch in el.findall("*")]
+            attrs = {k: v for el in elems for k, v in el.attrib.items()}
+
             if elems is None:
                 raise ValueError(msg)
 
-            if elems is not None and elems.find("*") is None and elems.attrib is None:
-                raise ValueError(msg)
+            if elems is not None:
+                if self.elems_only and children == []:
+                    raise ValueError(msg)
+                elif self.attrs_only and attrs == {}:
+                    raise ValueError(msg)
+                elif children == [] and attrs == {}:
+                    raise ValueError(msg)
 
         except (KeyError, SyntaxError):
             raise SyntaxError(
@@ -488,6 +497,8 @@ def _validate_path(self) -> None:
                 "undeclared namespace prefix."
             )
 
+        return elems
+
     def _validate_names(self) -> None:
         children: list[Any]
 
@@ -554,8 +565,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
                 self.xsl_doc = self._parse_doc(self.stylesheet)
                 self.xml_doc = self._transform_doc()
 
-            self._validate_path()
-            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
+            elems = self._validate_path()
 
         self._validate_names()
 
@@ -567,25 +577,33 @@ def parse_data(self) -> list[dict[str, str | None]]:
 
         return xml_dicts
 
-    def _validate_path(self) -> None:
+    def _validate_path(self) -> list[Any]:
 
         msg = (
-            "xpath does not return any nodes. "
-            "Be sure row level nodes are in xpath. "
+            "xpath does not return any nodes or attributes. "
+            "Be sure to specify in `xpath` the parent nodes of "
+            "children and attributes to parse. "
             "If document uses namespaces denoted with "
             "xmlns, be sure to define namespaces and "
             "use them in xpath."
         )
 
         elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
-        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
-        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)
+        children = [ch for el in elems for ch in el.xpath("*")]
+        attrs = {k: v for el in elems for k, v in el.attrib.items()}
 
         if elems == []:
             raise ValueError(msg)
 
-        if elems != [] and attrs == [] and children == []:
-            raise ValueError(msg)
+        if elems != []:
+            if self.elems_only and children == []:
+                raise ValueError(msg)
+            elif self.attrs_only and attrs == {}:
+                raise ValueError(msg)
+            elif children == [] and attrs == {}:
+                raise ValueError(msg)
+
+        return elems
 
     def _validate_names(self) -> None:
         children: list[Any]

diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -760,6 +760,45 @@ def test_elem_and_attrs_only(datapath, parser):
         read_xml(filename, elems_only=True, attrs_only=True, parser=parser)
 
 
+def test_empty_attrs_only(parser):
+    xml = """
+      <data>
+        <row>
+          <shape sides="4">square</shape>
+          <degrees>360</degrees>
+        </row>
+        <row>
+          <shape sides="0">circle</shape>
+          <degrees>360</degrees>
+        </row>
+        <row>
+          <shape sides="3">triangle</shape>
+          <degrees>180</degrees>
+        </row>
+      </data>"""
+
+    with pytest.raises(
+        ValueError,
+        match=("xpath does not return any nodes or attributes"),
+    ):
+        read_xml(xml, xpath="./row", attrs_only=True, parser=parser)
+
+
+def test_empty_elems_only(parser):
+    xml = """
+      <data>
+        <row sides="4" shape="square" degrees="360"/>
+        <row sides="0" shape="circle" degrees="360"/>
+        <row sides="3" shape="triangle" degrees="180"/>
+      </data>"""
+
+    with pytest.raises(
+        ValueError,
+        match=("xpath does not return any nodes or attributes"),
+    ):
+        read_xml(xml, xpath="./row", elems_only=True, parser=parser)
+
+
 @td.skip_if_no("lxml")
 def test_attribute_centric_xml():
     xml = """\