From b67d87670dcd59fb743ace757d31a9077a092e4b Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 31 Jan 2021 18:32:55 -0600 Subject: [PATCH 01/35] ENH: Add i/o support of XML with pandas.read_xml and DataFrame.to_xml (GH27554) --- doc/source/reference/io.rst | 7 + doc/source/whatsnew/v1.3.0.rst | 30 + pandas/__init__.py | 1 + pandas/core/frame.py | 172 +++ pandas/io/api.py | 1 + pandas/io/formats/format.py | 117 ++ pandas/io/formats/xml.py | 763 ++++++++++++ pandas/io/xml.py | 1017 +++++++++++++++ pandas/tests/io/data/xml/baby_names.xml | 53 + pandas/tests/io/data/xml/books.xml | 21 + pandas/tests/io/data/xml/cta_rail_lines.kml | 92 ++ pandas/tests/io/data/xml/flatten_doc.xsl | 18 + pandas/tests/io/data/xml/row_field_output.xsl | 19 + pandas/tests/io/formats/test_to_xml.py | 1099 +++++++++++++++++ pandas/tests/io/test_xml.py | 708 +++++++++++ 15 files changed, 4118 insertions(+) create mode 100644 pandas/io/formats/xml.py create mode 100644 pandas/io/xml.py create mode 100644 pandas/tests/io/data/xml/baby_names.xml create mode 100644 pandas/tests/io/data/xml/books.xml create mode 100644 pandas/tests/io/data/xml/cta_rail_lines.kml create mode 100644 pandas/tests/io/data/xml/flatten_doc.xsl create mode 100644 pandas/tests/io/data/xml/row_field_output.xsl create mode 100644 pandas/tests/io/formats/test_to_xml.py create mode 100644 pandas/tests/io/test_xml.py diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index e755ce94812bb..442631de50c7a 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -68,6 +68,13 @@ HTML read_html +XML +~~~~ +.. autosummary:: + :toctree: api/ + + read_xml + HDFStore: PyTables (HDF5) ~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autosummary:: diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 931ec895cc73f..0034d1bb3ecbc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -39,6 +39,36 @@ For example: ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See ref:`window.overview` for performance and functional benefits. (:issue:`15095`) +.. _whatsnew_130.read_to_xml: + +We added to support to read and generate shallow versions of xml documents. +With lxml as parser, full XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) + +.. ipython:: python + + xml = """ + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + + """ + + df = pd.read_xml(xml) + + df.to_xml() + .. _whatsnew_130.enhancements.other: Other enhancements diff --git a/pandas/__init__.py b/pandas/__init__.py index cc5d835a52833..cddd6397de33e 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -167,6 +167,7 @@ read_feather, read_gbq, read_html, + read_xml, read_json, read_stata, read_sas, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 03d439bd461da..60fe9ca3f6430 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2551,6 +2551,178 @@ def to_html( render_links=render_links, ) + def to_xml( + self, + io: Optional[FilePathOrBuffer[str]] = None, + index: Optional[bool] = True, + root_name: Optional[str] = "data", + row_name: Optional[str] = "row", + na_rep: Optional[str] = None, + attr_cols: Optional[Union[str, List[str]]] = None, + elem_cols: Optional[Union[str, List[str]]] = None, + namespaces: Optional[Union[dict, List[dict]]] = None, + prefix: Optional[str] = None, + encoding: Optional[str] = "utf-8", + xml_declaration: Optional[bool] = True, + pretty_print: Optional[bool] = True, + parser: Optional[str] = "lxml", + stylesheet: Optional[FilePathOrBuffer[str]] = None, + ) -> Optional[str]: + """ + Render a DataFrame to an XML document. + + .. 
versionadded:: 1.3.0 + + Parameters + ---------- + io : str, path object or file-like object, optional + File to write output to. If None, the output is returned as a + string. + index : bool, optional + Whether to include index in XML document. + root_name : str, default 'data' + The name of root element in XML document. + root_name : str, default 'row' + The name of row element in XML document. + na_rep : str, optional + Missing data representation. + attr_cols : list-like, optional + List of columns to write as attributes in row element. + Hierarchical columns will be flattened with underscore + delimiting the different levels. + elem_cols : list-like, optional + List of columns to write as children in row element. By default, + all columns output as children of row element. Hierarchical + columns will be flattened with underscore delimiting the + different levels. + namespaces : dict, optional + All namespaces to be defined in root element. Keys of dict + should be prefix names and values of dict corresponding URIs. + Default namespaces should be given empty string key. For + example, :: + + namespaces = {'': 'https://example.com'} + + prefix : str, optional + Namespace prefix to be used for every element and/or attribute + in document. This should be one of the keys in ``namespaces`` + dict. + encoding : str, optional, default 'utf-8' + Encoding of the resulting document. + xml_declaration : str, optional + Whether to include the XML declaration at start of document. + pretty_print : bool, optional + Whether output should be pretty printed with indentation and + line breaks. + parser : {'lxml','etree'}, default "lxml" + Parser module to use for building of tree. Only 'lxml' and + 'etree' are supported. With 'lxml', the ability to use XSLT + stylesheet is supported. Default parser uses 'lxml'. If + module is not installed a warning will raise and process + will continue with 'etree'. 
+ stylesheet : str, path object or file-like object, optional + A URL, file-like object, or a raw string containing an XSLT + script used to transform the raw XML output. Script should use + layout of elements and attributes from original output. This + argument requires ``lxml`` to be installed. Only XSLT 1.0 + scripts and not later versions is currently supported. + + Returns + ------- + None or str + If ``io`` is None, returns the resulting XML format as a + string. Otherwise returns None. + + See Also + -------- + to_json : Convert the pandas object to a JSON string. + to_html : Convert DataFrame to a html. + + Examples + -------- + >>> df = pd.DataFrame({'shape': ['square', 'circle', 'triangle'], + ... 'degrees': [360, 360, 180], + ... 'sides': [4, np.nan, 3]}) + + >>> df.to_xml() + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + + + + >>> df.to_xml(attr_cols=['index', 'shape', 'degrees', 'sides']) + + + + + + + + >>> df.to_xml(namespaces = {"doc": "https://example.com"}, + ... 
prefix = "doc") + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + + + """ + + formatter = fmt.DataFrameFormatter( + self, + index=index, + na_rep=na_rep, + ) + + return fmt.DataFrameRenderer(formatter).to_xml( + io=io, + index=index, + root_name=root_name, + row_name=row_name, + na_rep=na_rep, + attr_cols=attr_cols, + elem_cols=elem_cols, + namespaces=namespaces, + prefix=prefix, + encoding=encoding, + xml_declaration=xml_declaration, + pretty_print=pretty_print, + parser=parser, + stylesheet=stylesheet, + ) + # ---------------------------------------------------------------------- @Substitution( klass="DataFrame", diff --git a/pandas/io/api.py b/pandas/io/api.py index 2d25ffe5f8a6b..ad514014c3e6d 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -19,3 +19,4 @@ from pandas.io.spss import read_spss from pandas.io.sql import read_sql, read_sql_query, read_sql_table from pandas.io.stata import read_stata +from pandas.io.xml import read_xml diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b3c2411304f6b..a372cd4f16119 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -28,6 +28,7 @@ cast, ) from unicodedata import east_asian_width +from warnings import warn import numpy as np @@ -913,6 +914,7 @@ class DataFrameRenderer: Called in pandas.core.frame.DataFrame: - to_html + - to_xml - to_string Parameters @@ -1002,6 +1004,121 @@ def to_html( string = html_formatter.to_string() return save_to_buffer(string, buf=buf, encoding=encoding) + def to_xml( + self, + io: Optional[FilePathOrBuffer[str]] = None, + index: Optional[bool] = True, + root_name: Optional[str] = "data", + row_name: Optional[str] = "row", + na_rep: Optional[str] = None, + attr_cols: Optional[Union[str, List[str]]] = None, + elem_cols: Optional[Union[str, List[str]]] = None, + namespaces: Optional[Union[dict, List[dict]]] = None, + prefix: Optional[str] = None, + encoding: Optional[str] = "utf-8", + 
xml_declaration: Optional[bool] = True, + pretty_print: Optional[bool] = True, + parser: Optional[str] = "lxml", + stylesheet: Optional[FilePathOrBuffer[str]] = None, + ) -> Optional[str]: + """ + Render a DataFrame to an XML document. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + io : str, path object or file-like object, optional + File to write output to. If None, the output is returned as a + string. + index : bool, optional + Whether to include index in XML document. + root_name : str, default 'data' + The name of root element in XML document. + root_name : str, default 'row' + The name of row element in XML document. + na_rep : str, optional + Missing data representation. + attr_cols : list-like, optional + List of columns to write as attributes in row element. + Hierarchical columns will be flattened with underscore + delimiting the different levels. + elem_cols : list-like, optional + List of columns to write as children in row element. By default, + all columns output as children of row element. Hierarchical + columns will be flattened with underscore delimiting the + different levels. + namespaces : dict, optional + All namespaces to be defined in root element. Keys of dict + should be prefix names and values of dict corresponding URIs. + Default namespaces should be given empty string key. For + example, :: + + namespaces = {'': 'https://example.com'} + + prefix : str, optional + Namespace prefix to be used for every element and/or attribute + in document. This should be one of the keys in ``namespaces`` + dict. + encoding : str, optional, default 'utf-8' + Encoding of the resulting document. + xml_declaration : str, optional + Whether to include the XML declaration at start of document. + pretty_print : bool, optional + Whether output should be pretty printed with indentation and + line breaks. + parser : {'lxml','etree'}, default "lxml" + Parser module to use for building of tree. Only 'lxml' and + 'etree' are supported. 
With 'lxml', the ability to use XSLT + stylesheet is supported. Default parser uses 'lxml'. If + module is not installed a warning will raise and process + will continue with 'etree'. + stylesheet : str, path object or file-like object, optional + A URL, file-like object, or a raw string containing an XSLT + script used to transform the raw XML output. Script should use + layout of elements and attributes from original output. This + argument requires ``lxml`` to be installed. Only XSLT 1.0 + scripts and not later versions is currently supported. + """ + + from pandas.io.formats.xml import EtreeXMLFormatter, LxmlXMLFormatter + + if parser == "lxml": + try: + TreeBuilder = LxmlXMLFormatter + except ImportError: + warn( + "You do not have lxml installed (default parser). " + "Instead, etree will be used.", + ImportWarning, + ) + TreeBuilder = EtreeXMLFormatter + + elif parser == "etree": + TreeBuilder = EtreeXMLFormatter + + else: + raise ValueError("Values for parser can only be lxml or etree.") + + xml_formatter = TreeBuilder( + self.fmt, + io=io, + index=index, + root_name=root_name, + row_name=row_name, + na_rep=na_rep, + attr_cols=attr_cols, + elem_cols=elem_cols, + namespaces=namespaces, + prefix=prefix, + encoding=encoding, + xml_declaration=xml_declaration, + pretty_print=pretty_print, + stylesheet=stylesheet, + ) + + return xml_formatter.write_output() + def to_string( self, buf: Optional[FilePathOrBuffer[str]] = None, diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py new file mode 100644 index 0000000000000..86448c9d4498f --- /dev/null +++ b/pandas/io/formats/xml.py @@ -0,0 +1,763 @@ +""" +Module for formatting output data in XML. 
+""" + +import codecs +import io +from typing import Dict, List, Optional, Union +from urllib.error import HTTPError, URLError +from warnings import warn + +from pandas._typing import FilePathOrBuffer + +from pandas.core.dtypes.common import is_list_like + +from pandas.io.common import is_url, urlopen +from pandas.io.formats.format import DataFrameFormatter + + +class EtreeXMLFormatter: + """ + Class for formatting data in xml using Python standard library + modules: `xml.etree.ElementTree` and `xml.dom.minidom`. + + Parameters + ---------- + io : str or file-like + This can be either a string of raw XML, a valid URL, + file or file-like object. + + index : bool + Whether to include index in xml document. + + row_name : str + Name for root of xml document. Default is 'data'. + + root_name : str + Name for row elemens of xml document. Default is 'row'. + + na_rep : str + Missing data representation. + + attrs_cols : list + List of columns to write as attributes in row element. + + elem_cols : list + List of columns to write as children in row element. + + namespacess : dict + The namespaces to define in XML document as dicts with key + being namespace and value the URI. + + prefix : str + The prefix for each element in XML document including root. + + encoding : str + Encoding of xml object or document. + + xml_declaration : bool + Whether to include xml declaration at top line item in xml. + + pretty_print : bool + Whether to write xml document with line breaks and indentation. + + stylesheet : str or file-like + A URL, file, file-like object, or a raw string containing XSLT, + `etree` does not support XSLT but retained for consistency. + + See also + -------- + pandas.io.formats.xml.LxmlXMLFormatter + + Notes + ----- + This class serves as fall back option if user does not have + ``lxml`` installed or user specifically requests ``etree`` parser. 
+ """ + + def __init__( + self, + formatter: DataFrameFormatter, + io: Optional[FilePathOrBuffer[str]] = None, + index: Optional[bool] = True, + root_name: Optional[str] = "data", + row_name: Optional[str] = "row", + na_rep: Optional[str] = None, + attr_cols: Optional[Union[str, List[str]]] = None, + elem_cols: Optional[Union[str, List[str]]] = None, + namespaces: Optional[Dict[str, str]] = None, + prefix: Optional[str] = None, + encoding: Optional[str] = "utf-8", + xml_declaration: Optional[bool] = True, + pretty_print: Optional[bool] = True, + stylesheet: Optional[FilePathOrBuffer[str]] = None, + ) -> None: + self.fmt = formatter + self.io = io + self.index = index + self.root_name = root_name + self.row_name = row_name + self.na_rep = na_rep + self.attr_cols = attr_cols + self.elem_cols = elem_cols + self.namespaces = namespaces + self.prefix = prefix + self.encoding = encoding + self.xml_declaration = xml_declaration + self.pretty_print = pretty_print + self.stylesheet = stylesheet + self.frame = self.fmt.frame + + self.validate_columns() + self.validate_encoding() + self.orig_cols = self.fmt.frame.columns.tolist() + self.frame_dicts = self.process_dataframe() + self.handle_indexes() + self.prefix_uri = self.get_prefix_uri() + + def build_tree(self) -> bytes: + """ + Build tree from data. + + This method initializes the root and builds attributes and elements + with optional namespaces. 
+ """ + from xml.etree.ElementTree import Element, SubElement, tostring + + self.root = Element( + f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() + ) + + for k, d in self.frame_dicts.items(): + self.d = d + self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") + + if self.attr_cols: + self.build_attribs() + if self.elem_cols: + self.build_elems() + if not self.attr_cols and not self.elem_cols: + self.elem_cols = list(self.frame_dicts[0].keys()) + self.build_elems() + + self.out_xml = tostring(self.root, method="xml", encoding=self.encoding) + + if self.pretty_print: + self.out_xml = self.prettify_tree() + + if not self.xml_declaration: + self.out_xml = self.remove_declaration() + + if self.stylesheet: + warn( + "To use stylesheet, you need lxml installed. " + "The non-transformed, original XML is returned instead.", + UserWarning, + ) + + return self.out_xml + + def validate_columns(self) -> None: + """ + Validate elems_cols and attrs_cols. + + This method will check if columns is list-like. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.attr_cols and not is_list_like(self.attr_cols): + raise TypeError( + f"{type(self.attr_cols).__name__} is not a valid type for attr_cols" + ) + + if self.elem_cols and not is_list_like(self.elem_cols): + raise TypeError( + f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" + ) + + def validate_encoding(self) -> None: + """ + Validate encoding. + + This method will check if encoding is among listed under codecs. + + Raises + ------ + LookupError + * If encoding is not available in codecs. + """ + + try: + codecs.lookup(self.encoding) + except LookupError as e: + raise e + + def process_dataframe(self) -> None: + """ + Adjust Data Frame to fit xml output. + + This method will adjust underlying data frame for xml output, + including replacing missing entities and including indexes. 
+ """ + + na_dict = {"None": self.na_rep, "NaN": self.na_rep, "nan": self.na_rep} + + df = ( + (self.fmt.frame.reset_index().applymap(str).replace(na_dict)) + if self.index + else self.fmt.frame.applymap(str).replace(na_dict) + ) + + return df.to_dict(orient="index") + + def handle_indexes(self) -> None: + """ + Handle indexes. + + This method will add indexes into attr_cols or elem_cols. + """ + + indexes = [x for x in self.frame_dicts[0].keys() if x not in self.orig_cols] + + if self.attr_cols and self.index: + self.attr_cols = list(indexes) + self.attr_cols + + if self.elem_cols and self.index: + self.elem_cols = list(indexes) + self.elem_cols + + def get_prefix_uri(self) -> str: + """ + Get uri of namespace prefix. + + This method retrieves corresponding URI to prefix in namespaces. + """ + + from xml.etree.ElementTree import register_namespace + + uri = "" + if self.namespaces: + for p, n in self.namespaces.items(): + register_namespace(p, n) + if self.prefix: + try: + uri = f"{{{self.namespaces[self.prefix]}}}" + except (KeyError): + raise KeyError("prefix is not included in namespaces") + else: + uri = f'{{{self.namespaces[""]}}}' + + return uri + + def other_namespaces(self) -> dict: + """ + Define other namespaces. + + This method will build dictionary of namespaces attributes + for root element, conditionally with optional namespaces and + prefix. + """ + + nmsp_dict = {} + if self.namespaces and self.prefix is None: + nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p != ""} + + if self.namespaces and self.prefix: + nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p == ""} + + return nmsp_dict + + def build_attribs(self) -> None: + """ + Create attributes of row. + + This method adds attributes using attr_cols to row element and + works with tuples for multindex or hierarchical columns. 
+ """ + + for col in self.attr_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + attr_name = f"{self.prefix_uri}{flat_col}" + try: + if self.d[col] is not None: + self.elem_row.attrib[attr_name] = str(self.d[col]) + except KeyError: + raise KeyError(f"no valid column, {col}") + + def build_elems(self) -> None: + """ + Create child elements of row. + + This method adds child elements using elem_cols to row element and + works with tuples for multindex or hierarchical columns. + """ + + from xml.etree.ElementTree import SubElement + + for col in self.elem_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + elem_name = f"{self.prefix_uri}{flat_col}" + try: + val = None if self.d[col] in [None, ""] else str(self.d[col]) + SubElement(self.elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") + + def prettify_tree(self) -> bytes: + """ + Output tree for pretty print format. + + This method will pretty print xml with line breaks and indentation. + """ + + from xml.dom.minidom import parseString + + dom = parseString(self.out_xml) + + return dom.toprettyxml(indent=" ", encoding=self.encoding) + + def remove_declaration(self) -> None: + """ + Remove xml declaration. + + This method will remove xml declaration of working tree. Currently, + pretty_print is not supported in etree. 
+ """ + + return self.out_xml.split(b"?>")[-1].strip() + + def write_output(self) -> Optional[str]: + xml_doc = self.build_tree() + + try: + if self.io: + with open(self.io, "wb") as f: + f.write(xml_doc) + xml_doc = None + else: + xml_doc = xml_doc.decode(self.encoding).rstrip() + except (UnicodeDecodeError, OSError) as e: + raise e + + return xml_doc + + +class LxmlXMLFormatter: + """ + Class for formatting data in xml using Python standard library + modules: `xml.etree.ElementTree` and `xml.dom.minidom`. + + Parameters + ---------- + io : str or file-like + This can be either a string of raw XML, a valid URL, + file or file-like object. + + index : bool + Whether to include index in xml document. + + row_name : str + Name for root of xml document. Default is 'data'. + + root_name : str + Name for row elemens of xml document. Default is 'row'. + + na_rep : str + Missing data representation. + + attrs_cols : list + List of columns to write as attributes in row element. + + elem_cols : list + List of columns to write as children in row element. + + namespacess : dict + The namespaces to define in XML document as dicts with key + being namespace and value the URI. + + prefix : str + The prefix for each element in XML document including root. + + encoding : str + Encoding of xml object or document. + + xml_declaration : bool + Whether to include xml declaration at top line item in xml. + + pretty_print : bool + Whether to write xml document with line breaks and indentation. + + stylesheet : str or file-like + A URL, file, file-like object, or a raw string containing XSLT. + + See also + -------- + pandas.io.formats.xml.EtreeXMLFormatter + + Notes + ----- + This class serves as default option. If user does not have `lxml` + installed, `to_xml` will fall back with EtreeXMLFormatter. 
+ """ + + def __init__( + self, + formatter: DataFrameFormatter, + io: Optional[FilePathOrBuffer[str]] = None, + index: Optional[bool] = True, + root_name: Optional[str] = "data", + row_name: Optional[str] = "row", + na_rep: Optional[str] = None, + attr_cols: Optional[Union[str, List[str]]] = None, + elem_cols: Optional[Union[str, List[str]]] = None, + namespaces: Optional[Dict[str, str]] = None, + prefix: Optional[str] = None, + encoding: Optional[str] = "utf-8", + xml_declaration: Optional[bool] = True, + pretty_print: Optional[bool] = True, + stylesheet: Optional[FilePathOrBuffer[str]] = None, + ) -> None: + self.fmt = formatter + self.io = io + self.index = index + self.root_name = root_name + self.row_name = row_name + self.na_rep = na_rep + self.attr_cols = attr_cols + self.elem_cols = elem_cols + self.namespaces = namespaces + self.prefix = prefix + self.encoding = encoding + self.xml_declaration = xml_declaration + self.pretty_print = pretty_print + self.stylesheet = stylesheet + + self.validate_columns() + self.validate_encoding() + self.orig_cols = self.fmt.frame.columns.tolist() + self.frame_dicts = self.process_dataframe() + self.prefix_uri = self.get_prefix_uri() + + self.convert_empty_str_key() + self.handle_indexes() + + def build_tree(self) -> bytes: + """ + Build tree from data. + + This method initializes the root and builds attributes and elements + with optional namespaces. 
+ """ + from lxml.etree import Element, SubElement, tostring + + self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces) + + for k, d in self.frame_dicts.items(): + self.d = d + self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") + + if self.attr_cols: + self.build_attribs() + + if self.elem_cols: + self.build_elems() + + if not self.attr_cols and not self.elem_cols: + self.elem_cols = list(self.frame_dicts[0].keys()) + self.build_elems() + + self.out_xml = tostring( + self.root, + pretty_print=self.pretty_print, + method="xml", + encoding=self.encoding, + xml_declaration=self.xml_declaration, + ) + + if self.stylesheet: + self.out_xml = self.transform_doc() + + return self.out_xml + + def validate_columns(self) -> None: + """ + Validate elems_cols and attrs_cols. + + This method will check if columns is list-like. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.attr_cols and not is_list_like(self.attr_cols): + raise TypeError( + f"{type(self.attr_cols).__name__} is not a valid type for attr_cols" + ) + + if self.elem_cols and not is_list_like(self.elem_cols): + raise TypeError( + f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" + ) + + def validate_encoding(self) -> None: + """ + Validate encoding. + + This method will check if encoding is among listed under codecs. + + Raises + ------ + LookupError + * If encoding is not available in codecs. + """ + + try: + codecs.lookup(self.encoding) + except LookupError as e: + raise e + + def process_dataframe(self) -> dict: + """ + Adjust Data Frame to fit xml output. + + This method will adjust underlying data frame for xml output, + including replacing missing entities and including indexes. 
+ """ + + na_dict = {"None": self.na_rep, "NaN": self.na_rep, "nan": self.na_rep} + + df = ( + (self.fmt.frame.reset_index().applymap(str).replace(na_dict)) + if self.index + else self.fmt.frame.applymap(str).replace(na_dict) + ) + + return df.to_dict(orient="index") + + def convert_empty_str_key(self) -> None: + """ + Replace zero-lengh string in `namespaces`. + + This method will replce '' with None to align to `lxml` + requirement that empty string prefixes are not allowed. + """ + + if self.namespaces and "" in self.namespaces.keys(): + self.namespaces[None] = self.namespaces.pop("", "default") + + def handle_indexes(self) -> None: + """ + Handle indexes. + + This method will add indexes into attr_cols or elem_cols. + """ + indexes = [x for x in self.frame_dicts[0].keys() if x not in self.orig_cols] + + if self.attr_cols and self.index: + self.attr_cols = list(indexes) + self.attr_cols + + if self.elem_cols and self.index: + self.elem_cols = list(indexes) + self.elem_cols + + def get_prefix_uri(self) -> str: + """ + Get uri of namespace prefix. + + This method retrieves corresponding URI to prefix in namespaces. + + Raises + ------ + ValueError + *If prefix is not included in namespace dict. + """ + + uri = "" + if self.namespaces: + if self.prefix: + try: + uri = f"{{{self.namespaces[self.prefix]}}}" + except (KeyError): + raise KeyError("prefix is not included in namespaces") + else: + uri = f'{{{self.namespaces[""]}}}' + + return uri + + def build_attribs(self) -> None: + """ + Create attributes of row. + + This method adds attributes using attr_cols to row element and + works with tuples for multindex or hierarchical columns. 
+ """ + for col in self.attr_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + attr_name = f"{self.prefix_uri}{flat_col}" + try: + if self.d[col] is not None: + self.elem_row.attrib[attr_name] = self.d[col] + except KeyError: + raise KeyError(f"no valid column, {col}") + + def build_elems(self) -> None: + """ + Create child elements of row. + + This method adds child elements using elem_cols to row element and + works with tuples for multindex or hierarchical columns. + """ + from lxml.etree import SubElement + + for col in self.elem_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + elem_name = f"{self.prefix_uri}{flat_col}" + try: + val = None if self.d[col] in [None, ""] else str(self.d[col]) + SubElement(self.elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") + + def convert_io(self) -> Union[None, str]: + """ + Convert stylesheet object to string. + + This method will convert stylesheet object into a string or keep + as string, depending on object type. + """ + + obj = None + + if isinstance(self.stylesheet, str): + obj = self.stylesheet + + if isinstance(self.stylesheet, bytes): + obj = self.stylesheet.decode(self.encoding) + + if isinstance(self.stylesheet, io.StringIO): + obj = self.stylesheet.getvalue() + + if isinstance(self.stylesheet, io.BytesIO): + obj = self.stylesheet.getvalue().decode(self.encoding) + + if isinstance(self.stylesheet, io.TextIOWrapper): + obj = self.stylesheet.read() + + if isinstance(self.stylesheet, io.BufferedReader): + obj = self.stylesheet.read().decode(self.encoding) + + return obj + + def parse_doc(self): + """ + Build tree from stylesheet. 
+ + This method will parse stylesheet object into tree for parsing + conditionally by its specific object type. + + Raises + ------ + HttpError + * If URL cannot be reached. + + LookupError + * If xml document has incorrect or unknown encoding. + + OSError + * If file cannot be found. + + XMLSyntaxError + * If xml document conntains syntax issues. + + ValueError + * If io object is not readable as string or file-like object. + """ + + from lxml.etree import XML, XMLParser, XMLSyntaxError, parse + + current_doc = self.convert_io() + if current_doc: + is_xml = current_doc.startswith((" bytes: + """ + Transform original tree using stylesheet. + + This method will transform built tree with XSLT script. + """ + + from lxml.etree import XSLT, XSLTApplyError, XSLTParseError + + xsl_doc = self.parse_doc() + + try: + transformer = XSLT(xsl_doc) + new_doc = transformer(self.root) + + except (XSLTApplyError, XSLTParseError) as e: + raise e + + return bytes(new_doc) + + def write_output(self) -> Optional[str]: + xml_doc = self.build_tree() + + try: + if self.io: + with open(self.io, "wb") as f: + f.write(xml_doc) + xml_doc = None + else: + xml_doc = xml_doc.decode(self.encoding).rstrip() + + except (UnicodeDecodeError, OSError) as e: + raise e + + return xml_doc diff --git a/pandas/io/xml.py b/pandas/io/xml.py new file mode 100644 index 0000000000000..af2004c05428c --- /dev/null +++ b/pandas/io/xml.py @@ -0,0 +1,1017 @@ +""" +:mod:`pandas.io.xml` is a module containing functionality for dealing with +XML IO. 
+ +""" + +import io +from typing import Dict, List, Optional, Union +from urllib.error import HTTPError, URLError +from warnings import warn + +from pandas._typing import FilePathOrBuffer +from pandas.errors import ParserError +from pandas.util._decorators import deprecate_nonkeyword_arguments + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.frame import DataFrame + +from pandas.io.common import is_url, stringify_path, urlopen +from pandas.io.parsers import TextParser + + +class _EtreeFrameParser: + """ + Internal class to parse XML into DataFrames with the Python + standard library XML modules: `xml.etree.ElementTree`. + + Parameters + ---------- + io : str or file-like + This can be either a string of raw XML, a valid URL, + file or file-like object. + + xpath : str or regex + The XPath expression to parse required set of nodes for + migration to `Data Frame`. `etree` supports limited XPath. + + namespacess : dict + The namespaces defined in XML document (`xmlns:namespace='URI') + as dicts with key being namespace and value the URI. + + elems_only : bool + Parse only the child elements at the specified `xpath`. + + attrs_only : bool + Parse only the attributes at the specified `xpath`. + + names : list + Column names for Data Frame of parsed XML data. + + encoding : str + Encoding of xml object or document. + + stylesheet : str or file-like + URL, file, file-like object, or a raw string containing XSLT, + `etree` does not support XSLT but retained for consistency. + + See also + -------- + pandas.io.xml._LxmlFrameParser + + Notes + ----- + This class serves as fall back option if user does not have + ``lxml`` installed or user specifically requests ``etree`` parser. 
+ """ + + from xml.etree.ElementTree import Element, ElementTree + + def __init__( + self, + io, + xpath, + namespaces, + elems_only, + attrs_only, + names, + encoding, + stylesheet, + ): + self.io = io + self.xpath = xpath + self.namespaces = namespaces + self.elems_only = elems_only + self.attrs_only = attrs_only + self.names = names + self.encoding = encoding + self.stylesheet = stylesheet + + def parse_data(self) -> List[Dict[str, List[str]]]: + """ + Parse xml data. + + This method will call the other internal methods to + validate xpath, names, parse and return specific nodes. + """ + + if self.stylesheet: + warn( + "To use stylesheet, you need lxml installed. " + "Nodes will be parsed on original XML at the xpath.", + UserWarning, + ) + + self.xml_doc = self._parse_doc() + + self._validate_path() + self._validate_names() + + return self._parse_nodes() + + def _parse_nodes(self) -> List[Dict[str, List[str]]]: + """ + Parse xml nodes. + + This method will parse the children and attributes of elements + in xpath, conditionally for only elements, only attributes + or both while optionally renaming node names. + + Raises + ------ + ValueError + * If only elements and only attributes are specified. + + Notes + ----- + Namespace URIs will be removed from return node values.Also, + elements with missing children or attributes compared to siblings + will have optional keys filled withi None values. 
+ """ + + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + + if self.elems_only and self.attrs_only: + raise ValueError("Either element or attributes can be parsed not both.") + elif self.elems_only: + if self.names: + dicts = [ + { + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.findall("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + ch.tag: ch.text.strip() if ch.text else None + for ch in el.findall("*") + } + for el in elems + ] + + elif self.attrs_only: + dicts = [el.attrib for el in elems] + + else: + if self.names: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.findall("*")) + }, + } + for el in elems + ] + + else: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + ch.tag: ch.text.strip() if ch.text else None + for ch in el.findall("*") + }, + } + for el in elems + ] + + if self.namespaces: + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} + for d in dicts + ] + + keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) + dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] + + if self.names: + dicts = [ + {nm: v for nm, (k, v) in zip(self.names, d.items())} for d in dicts + ] + + return dicts + + def _validate_path(self) -> None: + """ + Validate xpath. + + This method checks for sytnax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpah is not supported or issues with namespaces. + + Notes + ----- + `etree` supports limited XPath. If user attempts a more complex + expression syntax error will raise. + """ + + msg = ( + "xpath does not return any nodes. 
" + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." + ) + try: + elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + if elems is None: + raise ValueError(msg) + + if elems is not None and elems.find("*") is None and elems.attrib is None: + raise ValueError(msg) + + except (KeyError, SyntaxError): + raise SyntaxError( + "You have used an incorrect or unsupported XPath " + "expression for etree library or you used an " + "undeclared namespace prefix." + ) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list-like and aligns + with length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.names: + children = self.xml_doc.find( + self.xpath, namespaces=self.namespaces + ).findall("*") + + if is_list_like(self.names): + if len(self.names) < len(children): + raise ValueError( + "names does not match length of child elements in xpath." + ) + else: + raise TypeError( + f"{type(self.names).__name__} is not a valid type for names" + ) + + def _convert_io(self) -> Union[None, str]: + """ + Convert io object to string. + + This method will convert io object into a string or keep + as string, depending on object type. + """ + + obj = None + + if isinstance(self.io, str): + obj = self.io + + if isinstance(self.io, bytes): + obj = self.io.decode(self.encoding) + + if isinstance(self.io, io.StringIO): + obj = self.io.getvalue() + + if isinstance(self.io, io.BytesIO): + obj = self.io.getvalue().decode(self.encoding) + + if isinstance(self.io, io.TextIOWrapper): + obj = self.io.read() + + if isinstance(self.io, io.BufferedReader): + obj = self.io.read().decode(self.encoding) + + return obj + + def _parse_doc(self) -> Union[Element, ElementTree]: + """ + Build tree from io. + + This method will parse io object into tree for parsing + conditionally by its specific object type. 
+ + Raises + ------ + HttpError + * If URL cannot be reached. + + OSError + * If file cannot be found. + + ParseError + * If xml document conntains syntax issues. + + ValueError + * If io object is not readable as string or file-like object. + """ + + from xml.etree.ElementTree import ParseError, fromstring, parse + + current_doc = self._convert_io() + if current_doc: + is_xml = current_doc.startswith((" List[Dict[str, List[str]]]: + """ + Parse xml data. + + This method will call the other internal methods to + validate xpath, names, optionally parse and run XSLT, + and parse original or transformed XML and return specific nodes. + """ + + self.xml_doc = self._parse_doc() + + if self.stylesheet: + self.is_style = True + self.xsl_doc = self._parse_doc() + self.xml_doc = self._transform_doc() + + self._validate_path() + self._validate_names() + + return self._parse_nodes() + + def _parse_nodes(self) -> List[Dict[str, List[str]]]: + """ + Parse xml nodes. + + This method will parse the children and attributes of elements + in xpath, conditionally for only elements, only attributes + or both while optionally renaming node names. + + Raises + ------ + ValueError + * If only elements and only attributes are specified. + + Notes + ----- + Namespace URIs will be removed from return node values.Also, + elements with missing children or attributes compared to siblings + will have optional keys filled withi None values. 
+ """ + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + + if self.elems_only and self.attrs_only: + raise ValueError("Either element or attributes can be parsed not both.") + + elif self.elems_only: + if self.names: + dicts = [ + { + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.xpath("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + ch.tag: ch.text.strip() if ch.text else None + for ch in el.xpath("*") + } + for el in elems + ] + + elif self.attrs_only: + dicts = [el.attrib for el in elems] + + else: + if self.names: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + nm: ch.text.strip() if ch.text else None + for nm, ch in zip(self.names, el.xpath("*")) + }, + } + for el in elems + ] + else: + dicts = [ + { + **el.attrib, + **( + {el.tag: el.text.strip()} + if el.text and not el.text.isspace() + else {} + ), + **{ + ch.tag: ch.text.strip() if ch.text else None + for ch in el.xpath("*") + }, + } + for el in elems + ] + + if self.namespaces or "}" in list(dicts[0].keys())[0]: + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} + for d in dicts + ] + + keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) + dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] + + if self.names: + dicts = [ + {nm: v for nm, (k, v) in zip(self.names, d.items())} for d in dicts + ] + + return dicts + + def _transform_doc(self): + """ + Transform original tree using stylesheet. + + This method will transform original xml using XSLT script into + am ideally flatter xml document for easier parsing and migration + to Data Frame. 
+ """ + from lxml.etree import XSLT, XSLTApplyError, XSLTParseError + + try: + transformer = XSLT(self.xsl_doc) + new_doc = transformer(self.xml_doc) + except (XSLTApplyError, XSLTParseError) as e: + raise e + + return new_doc + + def _validate_path(self) -> None: + """ + Validate xpath. + + This method checks for sytnax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpah is not supported or issues with namespaces. + + Notes + ----- + `etree` supports limited XPath. If user attempts a more complex + expression syntax error will raise. + """ + from lxml.etree import XPathEvalError, XPathSyntaxError + + try: + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) + attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) + + if (elems == [] and attrs == [] and children == []) or ( + elems != [] and attrs == [] and children == [] + ): + raise ValueError( + "xpath does not return any nodes. " + "Be sure row level nodes are in xpath. " + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." + ) + except (XPathEvalError, XPathSyntaxError, TypeError) as e: + raise e + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list and aligns with + length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.names: + children = self.xml_doc.xpath( + self.xpath + "[1]/*", namespaces=self.namespaces + ) + + if is_list_like(self.names): + if len(self.names) < len(children): + raise ValueError( + "names does not match length of child elements in xpath." + ) + else: + raise TypeError( + f"{type(self.names).__name__} is not a valid type for names" + ) + + def _convert_io(self) -> Union[None, str]: + """ + Convert filepath_or_buffer object to string. 
+ + This method will convert io object into a string or keep + as string, depending on object type. + """ + + obj = None + + if isinstance(self.raw_doc, str): + obj = self.raw_doc + + if isinstance(self.raw_doc, bytes): + obj = self.raw_doc.decode(self.encoding) + + if isinstance(self.raw_doc, io.StringIO): + obj = self.raw_doc.getvalue() + + if isinstance(self.raw_doc, io.BytesIO): + obj = self.raw_doc.getvalue().decode(self.encoding) + + if isinstance(self.raw_doc, io.TextIOWrapper): + obj = self.raw_doc.read() + + if isinstance(self.raw_doc, io.BufferedReader): + obj = self.raw_doc.read().decode(self.encoding) + + return obj + + def _parse_doc(self): + """ + Build tree from io. + + This method will parse io object into tree for parsing + conditionally by its specific object type. + + Raises + ------ + HttpError + * If URL cannot be reached. + + LookupError + * If xml document has incorrect or unknown encoding. + + OSError + * If file cannot be found. + + XMLSyntaxError + * If xml document conntains syntax issues. + + ValueError + * If io object is not readable as string or file-like object. + """ + + from lxml.etree import XML, XMLParser, XMLSyntaxError, parse + + self.raw_doc = self.stylesheet if self.is_style else self.io + + current_doc = self._convert_io() + if current_doc: + is_xml = current_doc.startswith((" DataFrame: + """ + Convert parsed data to Data Frame. + + This method will bind xml dictionary data of keys and values + into named columns of Data Frame using the built-in TextParser + class that build Data Frame and infers specific dtypes. + """ + + tags = [list(d.keys()) for d in data] + nodes = [list(d.values()) for d in data] + + try: + with TextParser(nodes, names=tags[0], **kwargs) as tp: + return tp.read() + except ParserError: + raise ParserError( + "XML document may be too complex for import. " + "Try to flatten document and use distinct " + "element and attribute names." 
+ ) + + +def _parse( + io, + xpath, + namespaces, + elems_only, + attrs_only, + names, + encoding, + parser, + stylesheet, + **kwargs, +) -> DataFrame: + """ + Call internal parsers. + + This method will conditionally call internal parsers: + LxmlFrameParser and/or EtreeParser. + + Raises + ------ + ValueError + * If parser is not lxml or etree.e. + + Notes + ----- + This method will raise a warning instead of module not found or + import error if user does not have 1xml and then reverts to + fallback option with etree parser. + """ + + if parser == "lxml": + try: + p = _LxmlFrameParser( + io, + xpath, + namespaces, + elems_only, + attrs_only, + names, + encoding, + stylesheet, + ) + except ImportError: + warn( + "You do not have lxml installed (default parser). " + "Instead, etree will be used.", + ImportWarning, + ) + + p = _EtreeFrameParser( + io, + xpath, + namespaces, + elems_only, + attrs_only, + names, + encoding, + stylesheet, + ) + + elif parser == "etree": + p = _EtreeFrameParser( + io, + xpath, + namespaces, + elems_only, + attrs_only, + names, + encoding, + stylesheet, + ) + else: + raise ValueError("Values for parser can only be lxml or etree.") + + data_dicts = p.parse_data() + + return _data_to_frame(data=data_dicts, **kwargs) + + +@deprecate_nonkeyword_arguments(version="2.0") +def read_xml( + io: FilePathOrBuffer, + xpath: Optional[str] = "./*", + namespaces: Optional[Union[dict, List[dict]]] = None, + elems_only: Optional[bool] = False, + attrs_only: Optional[bool] = False, + names: Optional[List[str]] = None, + encoding: Optional[str] = "utf-8", + parser: Optional[str] = "lxml", + stylesheet: Optional[FilePathOrBuffer[str]] = None, +) -> DataFrame: + r""" + Read XML docuemnts into a ``DataFrame`` object. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + io : str, path object or file-like object + A URL, file-like object, or raw string containing XML. 
+ + xpath : str, optional + The XPath to parse required set of nodes for migration to DataFrame. + XPath should return a collection of elements and not a single + element. Note: The ``etree`` parser supports limited XPath + expressions. For more complex XPath, use ``lxml`` which requires + installation. + + namespaces : dict, optional + The namespaces defined in XML document as dicts with key being + namespace prefix and value the URI. There is no need to include all + namespaces in XML, only the ones used in ``xpath`` expression. + Note: if XML document uses default namespace denoted as + `xmlns=''` without a prefix, you must assign any temporary + namespace, like 'doc', to URI in order to parse any underlying + nodes. For example, :: + + namespaces = {"doc": "https://example.com"} + + elems_only : bool, optional, default = False + Parse only the child elements at the specified ``xpath``. By default, + all child elements and non-empty text nodes are returned. + + attrs_only : bool, optional, default = False + Parse only the attributes at the specified ``xpath``. + By default, all attributes are returned. + + names : list-like, optional + Column names for DataFrame of parsed XML data. Use this parameter to + rename original element names and distinguish same named elements. + + encoding : str, optional, default = 'utf-8' + Encoding of XML document. + + parser : {'lxml','etree'}, default='lxml' + Parser module to use for retrieval of data. Only 'lxml' and + 'etree' are supported. With 'lxml' more complex XPath searches + and ability to use XSLT stylesheet are supported. Default parser + uses 'lxml'. If module is not installed a warning will raise and + process will continue with 'etree'. + + stylesheet : str, path object or file-like object + A URL, file-like object, or a raw string containing an XSLT script. + This stylesheet should flatten complex, deeply nested XML documents. 
+ To use this feature you must have ``lxml`` module installed and use + 'lxml' as ``parser``. The ``xpath`` must reference nodes of + transformed XML document generated after XSLT transformation and not + the original XML document. Only XSLT 1.0 scripts and not later + versions is currently supported. + + Returns + ------- + df + A DataFrame. + + See Also + -------- + read_json : Convert a JSON string to pandas object. + read_html : Read HTML tables into a list of DataFrame objects. + + Notes + ----- + This method is best designed to import shallow XML documents in + following format which is the ideal fit for the two-dimensions of a + ``DataFrame`` (row by column). :: + + + + data + data + data + ... + + + ... + + ... + + + As a file format, XML documents can be designed any way including + layout of elements and attributes as long as it conforms to W3C + specifications. Therefore, this method is a convenience handler for + a specific flatter design and not all possible XML structures. + + However, for more complex XML documents, ``stylesheet`` allows you to + temporarily redesign original document with XSLT (a special purpose + language) for a flatter version for migration to a DataFrame. + + This function will *always* return a single :class:`DataFrame` or raise + exceptions due to issues with XML document, ``xpath``, or other + parameters. + + Examples + -------- + >>> xml = ''' + ... + ... + ... square + ... 360 + ... 4.0 + ... + ... + ... circle + ... 360 + ... + ... + ... + ... triangle + ... 180 + ... 3.0 + ... + ... ''' + + >>> df = pd.read_xml(xml) + + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + + >>> xml = ''' + ... + ... + ... + ... + ... "''' + + >>> df = pd.read_xml(xml, xpath=".//row") + + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + + >>> xml = ''' + ... + ... + ... square + ... 360 + ... 4.0 + ... + ... + ... circle + ... 360 + ... + ... + ... + ... 
triangle + ... 180 + ... 3.0 + ... + ... ''' + + >>> df = pd.read(xml, + xpath="//doc:row", + namespaces = {'doc': 'https://example.com'}) + + >>> df + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + """ + + io = stringify_path(io) + + return _parse( + io=io, + xpath=xpath, + namespaces=namespaces, + elems_only=elems_only, + attrs_only=attrs_only, + names=names, + encoding=encoding, + parser=parser, + stylesheet=stylesheet, + ) diff --git a/pandas/tests/io/data/xml/baby_names.xml b/pandas/tests/io/data/xml/baby_names.xml new file mode 100644 index 0000000000000..b4797b79d7112 --- /dev/null +++ b/pandas/tests/io/data/xml/baby_names.xml @@ -0,0 +1,53 @@ + + + + 1 + Jos� + Sof�a + + + 2 + Luis + Valentina + + + 3 + Carlos + Isabella + + + 4 + Juan + Camila + + + 5 + Jorge + Valeria + + + 6 + Pedro + Mariana + + + 7 + Jes�s + Gabriela + + + 8 + Manuel + Sara + + + 9 + Santiago + Daniella + + + 10 + Sebasti�n + Mar�a Jos� + + diff --git a/pandas/tests/io/data/xml/books.xml b/pandas/tests/io/data/xml/books.xml new file mode 100644 index 0000000000000..666ce60e9a2be --- /dev/null +++ b/pandas/tests/io/data/xml/books.xml @@ -0,0 +1,21 @@ + + + + Everyday Italian + Giada De Laurentiis + 2005 + 30.00 + + + Harry Potter + J K. Rowling + 2005 + 29.99 + + + Learning XML + Erik T. Ray + 2003 + 39.95 + + diff --git a/pandas/tests/io/data/xml/cta_rail_lines.kml b/pandas/tests/io/data/xml/cta_rail_lines.kml new file mode 100644 index 0000000000000..c031137ee7b20 --- /dev/null +++ b/pandas/tests/io/data/xml/cta_rail_lines.kml @@ -0,0 +1,92 @@ + + + CTA_RailLines + + + CTA_RailLines + + + Blue Line (Forest Park) + +
Blue Line (Forest Park)
OBJECTID_1 1
ASSET_ID 21100001
LINES Blue Line (Forest Park)
DESCRIPTIO Oak Park to Austin
TYPE Elevated or at Grade
LEGEND BL
ALT_LEGEND BL
BRANCH Blue Line Forest Park
SHAPE.LEN 4060.368778
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.77678526964958,41.8708863930319,0 -87.77826234150609,41.87097820122218,0 -87.78251583439344,41.87130129991005,0 -87.78418294588424,41.87145055520308,0 -87.7872369165933,41.8717239119163,0 -87.79160214925886,41.87210797280065,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 2
ASSET_ID 21100002
LINES Red, Purple Line
DESCRIPTIO Lawrence to Wilson
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 1800.132896
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65758750947528,41.96427269188822,0 -87.65802133507393,41.96581929055245,0 -87.65819033925305,41.96621846093642,0 -87.6583189819129,41.96650362897086,0 -87.65835858701473,41.96669002089185,0 -87.65838428411853,41.96688150295095,0 -87.65842208882658,41.96745896091846,0 -87.65846556843937,41.9683761425439,0 -87.65849296214573,41.96913893870342,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 3
ASSET_ID 21100003
LINES Red, Purple Line
DESCRIPTIO Wilson to Sheridan
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 4256.243677
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65492939166126,41.95377494531437,0 -87.65557043199591,41.95376544118533,0 -87.65606302030132,41.95376391658746,0 -87.65623502146268,41.95377379126367,0 -87.65634748981634,41.95380103566435,0 -87.65646537904269,41.95387703994676,0 -87.65656532461145,41.95396622645799,0 -87.65664760856414,41.95404201996044,0 -87.65671750555913,41.95416647054043,0 -87.65673983607117,41.95429949810849,0 -87.65673866475777,41.95441024240925,0 -87.6567690255541,41.95490657227902,0 -87.65683672482363,41.95692259283837,0 -87.6568900886376,41.95861070983142,0 -87.65699865558875,41.96181418669004,0 -87.65756347177603,41.96397045777844,0 -87.65758750947528,41.96427269188822,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 4
ASSET_ID 21100004
LINES Red, Purple Line
DESCRIPTIO Sheridan to Addison
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 2581.713736
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65362593118043,41.94742799535678,0 -87.65363554415794,41.94819886386848,0 -87.6536456393239,41.95059994675451,0 -87.65365831235026,41.95108288489359,0 -87.6536604873874,41.9519954657554,0 -87.65362592053201,41.95245597302328,0 -87.65367158496069,41.95311153649393,0 -87.65368468595476,41.9533202828916,0 -87.65369271253692,41.95343095587119,0 -87.65373335834569,41.95351536301472,0 -87.65378605844126,41.95358212680591,0 -87.65385067928185,41.95364452823767,0 -87.6539390793817,41.95370263886964,0 -87.6540786298351,41.95373403675265,0 -87.65430648647626,41.9537535411832,0 -87.65492939166126,41.95377494531437,0 + + +
+ + Red, Purple Line + +
Red, Purple Line
OBJECTID_1 5
ASSET_ID 21100005
LINES Red, Purple Line
DESCRIPTIO Addison to Clark Junction
TYPE Elevated or at Grade
LEGEND RD
ALT_LEGEND RDPR
BRANCH Red Line North Side
SHAPE.LEN 1918.716686
]]>
+ #LineStyle01 + + + 0 + clampedToGround + -87.65345391792157,41.94217681262115,0 -87.65342448305786,41.94237224420864,0 -87.65339745703922,41.94268217746244,0 -87.65337753982941,41.94288140770284,0 -87.65336256753105,41.94317369618263,0 -87.65338799707138,41.94357253961736,0 -87.65340240886648,41.94389158188269,0 -87.65341837392448,41.94406444407721,0 -87.65342275247338,41.94421065714904,0 -87.65347469646018,41.94434829382345,0 -87.65351486483024,41.94447699917548,0 -87.65353483605053,41.9453896864472,0 -87.65361975532807,41.94689193720703,0 -87.65362593118043,41.94742799535678,0 + + +
+
+ +
+
diff --git a/pandas/tests/io/data/xml/flatten_doc.xsl b/pandas/tests/io/data/xml/flatten_doc.xsl new file mode 100644 index 0000000000000..a9d62d180beaf --- /dev/null +++ b/pandas/tests/io/data/xml/flatten_doc.xsl @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + diff --git a/pandas/tests/io/data/xml/row_field_output.xsl b/pandas/tests/io/data/xml/row_field_output.xsl new file mode 100644 index 0000000000000..5a0f0e655a78e --- /dev/null +++ b/pandas/tests/io/data/xml/row_field_output.xsl @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py new file mode 100644 index 0000000000000..62958894981fd --- /dev/null +++ b/pandas/tests/io/formats/test_to_xml.py @@ -0,0 +1,1099 @@ +from io import BytesIO, StringIO + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.xml import read_xml + +geom_df = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, np.nan, 3], + } +) + +planet_df = DataFrame( + { + "planet": [ + "Mercury", + "Venus", + "Earth", + "Mars", + "Jupiter", + "Saturn", + "Uranus", + "Neptune", + ], + "type": [ + "terrestrial", + "terrestrial", + "terrestrial", + "terrestrial", + "gas giant", + "gas giant", + "ice giant", + "ice giant", + ], + "location": [ + "inner", + "inner", + "inner", + "inner", + "outer", + "outer", + "outer", + "outer", + ], + "mass": [ + 0.330114, + 4.86747, + 5.97237, + 0.641712, + 1898.187, + 568.3174, + 86.8127, + 102.4126, + ], + } +) + +from_file_expected = """\ + + + + 0 + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + 1 + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + 2 + web + Learning XML + Erik T. 
Ray + 2003 + 39.95 + +""" + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_file_output_str_read(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml(path, parser=parser) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + web + Learning XML + Erik T. Ray + 2003 + 39.95 + +""" + + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml(path, index=False, parser=parser) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + web + Learning XML + Erik T. 
Ray + 2003 + 39.95 + +""" + + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml( + path, index=False, root_name="books", row_name="book", parser=parser + ) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_na_elem_output(datapath, parser): + output = geom_df.to_xml(parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + 0.0 + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(na_rep="0.0", parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + + +""" + + output = geom_df.to_xml(attr_cols=["shape", "degrees", "sides"], parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + + +""" + + output = geom_df.to_xml( + attr_cols=["index", "shape", "degrees", "sides"], + namespaces={"doc": "http://example.xom"}, + prefix="doc", + parser=parser, + ) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 360 + 4.0 + square + + + 360 + + circle + + + 180 + 3.0 + triangle + +""" + + output = geom_df.to_xml( + index=False, elem_cols=["degrees", "sides", "shape"], parser=parser + ) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 360 + 4.0 + + + 360 + + + + 180 + 3.0 + +""" + + output = geom_df.to_xml( + index=False, + elem_cols=["degrees", "sides"], + attr_cols=["shape"], + parser=parser, + ) + + # etree and lxml differs on quotes and case in 
xml declaration + output = output.replace( + ' + + + inner + terrestrial + 4 + 11.811666 + 2.9529165 + + + outer + gas giant + 2 + 2466.5044 + 1233.2522 + + + outer + ice giant + 2 + 189.2253 + 94.61265 + + + All + + 8 + 2667.541366 + 333.44267075 + +""" + + pvt = planet_df.pivot_table( + index=["location", "type"], + values="mass", + aggfunc=["count", "sum", "mean"], + margins=True, + ) + + output = pvt.to_xml(parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + + + +""" + + pvt = planet_df.pivot_table( + index=["location", "type"], + values="mass", + aggfunc=["count", "sum", "mean"], + margins=True, + ) + + output = pvt.to_xml(attr_cols=list(pvt.reset_index().columns.values), parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + inner + terrestrial + 4 + 11.811666 + 2.9529165 + + + outer + gas giant + 2 + 2466.5044 + 1233.2522 + + + outer + ice giant + 2 + 189.2253 + 94.61265 + +""" + + agg = planet_df.groupby(["location", "type"])["mass"].agg(["count", "sum", "mean"]) + + output = agg.to_xml(parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + + +""" + + agg = planet_df.groupby(["location", "type"])["mass"].agg(["count", "sum", "mean"]) + + output = agg.to_xml(attr_cols=list(agg.reset_index().columns.values), parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(namespaces={"": "http://example.com"}, parser=parser) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"doc": "http://example.com"}, prefix="doc", 
parser=parser + ) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"": "http://example.com", "doc": "http://other.org"}, + prefix="doc", + parser=parser, + ) + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + + 0 + 1 + José + Sofía + + + 1 + 2 + Luis + Valentina + + + 2 + 3 + Carlos + Isabella + + + 3 + 4 + Juan + Camila + + + 4 + 5 + Jorge + Valeria + +""" + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_encoding_option_str(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + df_file = read_xml(filename, parser=parser, encoding="ISO-8859-1").head(5) + + output = df_file.to_xml(encoding="ISO-8859-1") + + # etree and lxml differs on quotes and case in xml declaration + output = output.replace( + ' + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(xml_declaration=False, parser=parser) + + assert output == expected + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_no_pretty_print_with_decl(parser): + expected = ( + "\n" + "0square" + "3604.0" + "1circle360" + "2" + "triangle1803.0" + "" + ) + + output = geom_df.to_xml(pretty_print=False) + + output = output.replace( + '0square" + "3604.0" + "1circle360" + "2" + "triangle1803.0" + "" + ) + + output = geom_df.to_xml(xml_declaration=False, pretty_print=False) + + assert output == expected + + +xsl_expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_stylesheet_file_like(datapath, mode): + xsl = datapath("io", "data", "xml", "row_field_output.xsl") + + with open(xsl, mode) as f: + assert geom_df.to_xml(stylesheet=f) == 
xsl_expected + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_stylesheet_io(datapath, mode): + xsl = datapath("io", "data", "xml", "row_field_output.xsl") + + with open(xsl, mode) as f: + xsl_obj = f.read() + + xsl_io = BytesIO(xsl_obj) if isinstance(xsl_obj, bytes) else StringIO(xsl_obj) + + output = geom_df.to_xml(stylesheet=xsl_io) + + assert output == xsl_expected + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_stylesheet_buffered_reader(datapath, mode): + xsl = datapath("io", "data", "xml", "row_field_output.xsl") + + with open(xsl, mode) as f: + xsl_obj = f.read() + + output = geom_df.to_xml(stylesheet=xsl_obj) + + assert output == xsl_expected + + +@td.skip_if_no("lxml") +def test_style_to_csv(): + xsl = """\ + + + + + , + + ,shape,degrees,sides + + + + + + + +""" + + out_csv = geom_df.to_csv().strip() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_csv == out_xml + + +@td.skip_if_no("lxml") +def test_style_to_string(): + xsl = """\ + + + + + + + shape degrees sides + + + + + + + +""" + + out_str = geom_df.to_string() + out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) + + assert out_xml == out_str + + +@td.skip_if_no("lxml") +def test_style_to_json(): + xsl = """\ + + + + + " + + + {"shape":{ + + },"degrees":{ + + },"sides":{ + + }} + + + + + + + + + + + + + + + + + , + + +""" + + out_json = geom_df.to_json() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_json == out_xml + + +@pytest.mark.skip( + reason="incorrect tag in from to_html() to be skipped until fix" +) +def test_style_to_html(): + xsl = """\ + + + + + + + + + + + + + + + + + +
shapedegreessides
+
+ + + + + + + + + + + + +
""" + + out_html = geom_df.to_html() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_html == out_xml diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py new file mode 100644 index 0000000000000..375e9c2472742 --- /dev/null +++ b/pandas/tests/io/test_xml.py @@ -0,0 +1,708 @@ +from io import BytesIO, StringIO +import os +from urllib.error import HTTPError + +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.xml import read_xml + +xml_default_nmsp = """\ + + + + square + 360 + 4 + + + circle + 360 + + + + triangle + 180 + 3 + +""" + +xml_prefix_nmsp = """\ + + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + +""" + + +def test_parser_consistency_file(datapath): + filename = datapath("io", "data", "xml", "books.xml") + df_file_lxml = read_xml(filename, parser="lxml") + df_file_etree = read_xml(filename, parser="etree") + + tm.assert_frame_equal(df_file_lxml, df_file_etree) + + +@tm.network +@pytest.mark.slow +def test_parser_consistency_url(datapath): + url = ( + "https://data.cityofchicago.org/api/views/" + "8pix-ypme/rows.xml?accessType=DOWNLOAD" + ) + df_file_lxml = read_xml(url, xpath=".//row/row", parser="lxml") + df_file_etree = read_xml(url, xpath=".//row/row", parser="etree") + + tm.assert_frame_equal(df_file_lxml, df_file_etree) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_file_like(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + df_file = read_xml(f, parser=parser) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_file_io(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + xml_obj = f.read() + + df_io = read_xml( + (BytesIO(xml_obj) if isinstance(xml_obj, bytes) else StringIO(xml_obj)), + parser=parser, + ) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_io, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_file_buffered_reader_string(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + xml_obj = f.read() + + df_str = read_xml(xml_obj, parser=parser) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_str, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with open(filename, mode) as f: + next(f) + xml_obj = f.read() + + df_str = read_xml(xml_obj, parser=parser) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_str, df_expected) + + +def test_wrong_file_lxml(datapath): + with pytest.raises(OSError, match=("failed to load external entity")): + filename = os.path.join("data", "html", "books.xml") + read_xml(filename, parser="lxml") + + +def test_wrong_file_etree(datapath): + with pytest.raises(OSError, match=("No such file")): + filename = os.path.join("data", "html", "books.xml") + read_xml(filename, parser="etree") + + +@tm.network +@td.skip_if_no("lxml") +def test_url(): + url = "https://www.w3schools.com/xml/books.xml" + df_url = read_xml(url, xpath=".//book[count(*)=4]") + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + "cover": [None, None, "paperback"], + } + ) + + tm.assert_frame_equal(df_url, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_wrong_url(parser): + with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): + url = "https://www.w3schools.com/xml/python.xml" + read_xml(url, xpath=".//book[count(*)=4]", parser=parser) + + +def test_empty_xpath_lxml(datapath): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(ValueError, match=("xpath does not return any nodes")): + read_xml(filename, xpath=".//python", parser="lxml") + + +def test_bad_xpath_etree(datapath): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + SyntaxError, match=("You have used an incorrect or unsupported XPath") + ): + read_xml(filename, xpath=".//[book]", parser="etree") + + +@td.skip_if_no("lxml") +def test_bad_xpath_lxml(datapath): + from lxml.etree import XPathEvalError + + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(XPathEvalError, match=("Invalid expression")): + read_xml(filename, xpath=".//[book]", parser="lxml") + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_default_namespace(parser): + df_nmsp = read_xml( + xml_default_nmsp, + xpath=".//ns:row", + namespaces={"ns": "http://example.com"}, + parser=parser, + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_nmsp, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_prefix_namespace(parser): + df_nmsp = read_xml( + xml_prefix_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser=parser, + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + } + 
) + + tm.assert_frame_equal(df_nmsp, df_expected) + + +def test_consistency_default_namespace(): + df_lxml = read_xml( + xml_default_nmsp, + xpath=".//ns:row", + namespaces={"ns": "http://example.com"}, + parser="lxml", + ) + + df_etree = read_xml( + xml_default_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser="etree", + ) + + tm.assert_frame_equal(df_lxml, df_etree) + + +def test_consistency_prefix_namespace(): + df_lxml = read_xml( + xml_prefix_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser="lxml", + ) + + df_etree = read_xml( + xml_prefix_nmsp, + xpath=".//doc:row", + namespaces={"doc": "http://example.com"}, + parser="etree", + ) + + tm.assert_frame_equal(df_lxml, df_etree) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_missing_prefix_with_default_namespace(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(ValueError, match=("xpath does not return any nodes")): + read_xml(filename, xpath=".//Placemark", parser=parser) + + +def test_missing_prefix_definition_etree(datapath): + filename = datapath("io", "data", "xml", "cta_rail_lines.kml") + with pytest.raises(SyntaxError, match=("you used an undeclared namespace prefix")): + read_xml(filename, xpath=".//kml:Placemark", parser="etree") + + +@td.skip_if_no("lxml") +def test_missing_prefix_definition_lxml(datapath): + from lxml.etree import XPathEvalError + + filename = datapath("io", "data", "xml", "cta_rail_lines.kml") + with pytest.raises(XPathEvalError, match=("Undefined namespace prefix")): + read_xml(filename, xpath=".//kml:Placemark", parser="lxml") + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("key", ["", None]) +def test_none_namespace_prefix(key): + with pytest.raises( + TypeError, match=("empty namespace prefix is not supported in XPath") + ): + read_xml( + xml_default_nmsp, + xpath=".//kml:Placemark", + namespaces={key: "http://www.opengis.net/kml/2.2"}, + 
parser="lxml", + ) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_file_elems_and_attrs(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, parser=parser) + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_file_only_attrs(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, attrs_only=True, parser=parser) + df_expected = DataFrame({"category": ["cooking", "children", "web"]}) + + tm.assert_frame_equal(df_file, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_file_only_elems(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml(filename, elems_only=True, parser=parser) + df_expected = DataFrame( + { + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_elem_and_attrs_only(datapath, parser): + filename = datapath("io", "data", "xml", "cta_rail_lines.kml") + with pytest.raises( + ValueError, + match=("Either element or attributes can be parsed not both"), + ): + read_xml(filename, elems_only=True, attrs_only=True, parser=parser) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_names_option_output(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_file = read_xml( + filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser + ) + + df_expected = DataFrame( + { + "Col1": ["cooking", "children", "web"], + "Col2": ["Everyday Italian", "Harry Potter", "Learning XML"], + "Col3": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "Col4": [2005, 2005, 2003], + "Col5": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_names_option_wrong_length(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises(ValueError, match=("names does not match length")): + read_xml(filename, names=["Col1", "Col2", "Col3"], parser=parser) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_names_option_wrong_type(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises(TypeError, match=("is not a valid type for names")): + read_xml(filename, names="Col1, Col2, Col3", parser=parser) + + +@td.skip_if_no("lxml") +def test_wrong_encoding_lxml(datapath): + from lxml.etree import XMLSyntaxError + + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises(XMLSyntaxError, match=("Input is not proper UTF-8")): + read_xml(filename) + + +@td.skip_if_no("lxml") +def 
test_utf16_encoding_lxml(datapath): + from lxml.etree import XMLSyntaxError + + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises(XMLSyntaxError, match=("Start tag expected, '<' not found")): + read_xml(filename, encoding="UTF-16") + + +@td.skip_if_no("lxml") +def test_unknown_encoding_lxml(datapath): + filename = datapath("io", "data", "xml", "baby_names.xml") + with pytest.raises(LookupError, match=("unknown encoding")): + read_xml(filename, encoding="UFT-8") + + +# etree raises no error on wrong, utf-16, or unknown encoding +@pytest.mark.parametrize("encoding", [None, "UTF-16", "UFT-8"]) +def test_wrong_encoding_etree(datapath, encoding): + filename = datapath("io", "data", "xml", "baby_names.xml") + read_xml(filename, parser="etree", encoding=encoding) + + +@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_ascii_encoding(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + read_xml(filename, encoding="ascii", parser=parser) + + +def test_parser_consistency_with_encoding(datapath): + filename = datapath("io", "data", "xml", "baby_names.xml") + df_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") + df_etree = read_xml(filename, parser="etree") + + tm.assert_frame_equal(df_lxml, df_etree) + + +def test_attribute_centric_xml(): + xml = """\ + + + + + + + + + + + + + + + + + +""" + + df_lxml = read_xml(xml, xpath=".//station") + df_etree = read_xml(xml, xpath=".//station", parser="etree") + + tm.assert_frame_equal(df_lxml, df_etree) + + +def test_wrong_parser(datapath): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises( + ValueError, match=("Values for parser can only be lxml or etree.") + ): + read_xml(filename, parser="bs4") + + +@td.skip_if_no("lxml") +def test_stylesheet_file(datapath): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + data = { + "id": { + 0: "ID_00001", + 1: 
"ID_00002", + 2: "ID_00003", + 3: "ID_00004", + 4: "ID_00005", + }, + "name": { + 0: "Blue Line (Forest Park)", + 1: "Red, Purple Line", + 2: "Red, Purple Line", + 3: "Red, Purple Line", + 4: "Red, Purple Line", + }, + "styleUrl": { + 0: "#LineStyle01", + 1: "#LineStyle01", + 2: "#LineStyle01", + 3: "#LineStyle01", + 4: "#LineStyle01", + }, + "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}, + "altitudeMode": { + 0: "clampedToGround", + 1: "clampedToGround", + 2: "clampedToGround", + 3: "clampedToGround", + 4: "clampedToGround", + }, + "coordinates": { + 0: ( + "-87.77678526964958,41.8708863930319,0 " + "-87.77826234150609,41.87097820122218,0 " + "-87.78251583439344,41.87130129991005,0 " + "-87.78418294588424,41.87145055520308,0 " + "-87.7872369165933,41.8717239119163,0 " + "-87.79160214925886,41.87210797280065,0" + ), + 1: ( + "-87.65758750947528,41.96427269188822,0 " + "-87.65802133507393,41.96581929055245,0 " + "-87.65819033925305,41.96621846093642,0 " + "-87.6583189819129,41.96650362897086,0 " + "-87.65835858701473,41.96669002089185,0 " + "-87.65838428411853,41.96688150295095,0 " + "-87.65842208882658,41.96745896091846,0 " + "-87.65846556843937,41.9683761425439,0 " + "-87.65849296214573,41.96913893870342,0" + ), + 2: ( + "-87.65492939166126,41.95377494531437,0 " + "-87.65557043199591,41.95376544118533,0 " + "-87.65606302030132,41.95376391658746,0 " + "-87.65623502146268,41.95377379126367,0 " + "-87.65634748981634,41.95380103566435,0 " + "-87.65646537904269,41.95387703994676,0 " + "-87.65656532461145,41.95396622645799,0 " + "-87.65664760856414,41.95404201996044,0 " + "-87.65671750555913,41.95416647054043,0 " + "-87.65673983607117,41.95429949810849,0 " + "-87.65673866475777,41.95441024240925,0 " + "-87.6567690255541,41.95490657227902,0 " + "-87.65683672482363,41.95692259283837,0 " + "-87.6568900886376,41.95861070983142,0 " + "-87.65699865558875,41.96181418669004,0 " + "-87.65756347177603,41.96397045777844,0 " + "-87.65758750947528,41.96427269188822,0" + ), + 3: ( + 
"-87.65362593118043,41.94742799535678,0 " + "-87.65363554415794,41.94819886386848,0 " + "-87.6536456393239,41.95059994675451,0 " + "-87.65365831235026,41.95108288489359,0 " + "-87.6536604873874,41.9519954657554,0 " + "-87.65362592053201,41.95245597302328,0 " + "-87.65367158496069,41.95311153649393,0 " + "-87.65368468595476,41.9533202828916,0 " + "-87.65369271253692,41.95343095587119,0 " + "-87.65373335834569,41.95351536301472,0 " + "-87.65378605844126,41.95358212680591,0 " + "-87.65385067928185,41.95364452823767,0 " + "-87.6539390793817,41.95370263886964,0 " + "-87.6540786298351,41.95373403675265,0 " + "-87.65430648647626,41.9537535411832,0 " + "-87.65492939166126,41.95377494531437,0" + ), + 4: ( + "-87.65345391792157,41.94217681262115,0 " + "-87.65342448305786,41.94237224420864,0 " + "-87.65339745703922,41.94268217746244,0 " + "-87.65337753982941,41.94288140770284,0 " + "-87.65336256753105,41.94317369618263,0 " + "-87.65338799707138,41.94357253961736,0 " + "-87.65340240886648,41.94389158188269,0 " + "-87.65341837392448,41.94406444407721,0 " + "-87.65342275247338,41.94421065714904,0 " + "-87.65347469646018,41.94434829382345,0 " + "-87.65351486483024,41.94447699917548,0 " + "-87.65353483605053,41.9453896864472,0 " + "-87.65361975532807,41.94689193720703,0 " + "-87.65362593118043,41.94742799535678,0" + ), + }, + } + + df_expected = DataFrame(data) + df_style = read_xml( + kml, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=xsl, + ) + + tm.assert_frame_equal(df_expected, df_style) + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_stylesheet_file_like(datapath, mode): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + with open(xsl, mode) as f: + read_xml(kml, stylesheet=f) + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_stylesheet_io(datapath, mode): + kml = datapath("io", "data", 
"xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + with open(xsl, mode) as f: + xsl_obj = f.read() + + xsl_io = BytesIO(xsl_obj) if isinstance(xsl_obj, bytes) else StringIO(xsl_obj) + read_xml(kml, stylesheet=xsl_io) + + +@td.skip_if_no("lxml") +@pytest.mark.parametrize("mode", ["rb", "r"]) +def test_stylesheet_buffered_reader(datapath, mode): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + with open(xsl, mode) as f: + xsl_obj = f.read() + + read_xml(kml, stylesheet=xsl_obj) + + +@td.skip_if_no("lxml") +def test_wrong_stylesheet(datapath): + kml = os.path.join("data", "xml", "cta_rail_lines.kml") + xsl = os.path.join("data", "xml", "flatten.xsl") + + with pytest.raises(OSError, match=("failed to load external entity")): + read_xml(kml, stylesheet=xsl) + + +@tm.network +@td.skip_if_no("lxml") +def test_online_stylesheet(): + xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml" + xsl = "https://www.w3schools.com/xml/cdcatalog.xsl" + + df_xsl = read_xml( + xml, + xpath=".//tr[td and position() <= 6]", + names=["title", "artist"], + stylesheet=xsl, + ) + + df_expected = DataFrame( + { + "title": { + 0: "Empire Burlesque", + 1: "Hide your heart", + 2: "Greatest Hits", + 3: "Still got the blues", + 4: "Eros", + }, + "artist": { + 0: "Bob Dylan", + 1: "Bonnie Tyler", + 2: "Dolly Parton", + 3: "Gary Moore", + 4: "Eros Ramazzotti", + }, + } + ) + + tm.assert_frame_equal(df_expected, df_xsl) From cd79a06871aacd0fa1a640389a043cce501c614c Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 2 Feb 2021 18:53:13 -0600 Subject: [PATCH 02/35] Refactor code for base classes, add tests, adjust whatsnew entry --- doc/source/whatsnew/v1.3.0.rst | 53 +++- pandas/core/frame.py | 6 +- pandas/io/formats/format.py | 15 +- pandas/io/formats/xml.py | 409 ++++++++----------------- pandas/io/xml.py | 403 ++++++++++-------------- pandas/tests/io/formats/test_to_xml.py | 
361 +++++++++++++++------- pandas/tests/io/test_xml.py | 267 ++++++++++++---- 7 files changed, 814 insertions(+), 700 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3a6ac281829fe..85b272767e642 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -41,34 +41,71 @@ See ref:`window.overview` for performance and functional benefits. (:issue:`1509 .. _whatsnew_130.read_to_xml: -We added I/O support to read and render shallow versions of XML documents with -:func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using lxml as parser, +We added I/O support to read and render shallow versions of XML documents with +:func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using lxml as parser, +full XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) +======= +We added I/O support to read and render shallow versions of `XML`_ documents with +:func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using `lxml`_ as parser, full XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) -.. ipython:: python +.. _XML: https://www.w3.org/standards/xml/core +.. _lxml: https://lxml.de - xml = """ +.. code-block:: ipython + + In [1]: xml = """ + ...: + ...: + ...: square + ...: 360 + ...: 4.0 + ...: + ...: + ...: circle + ...: 360 + ...: + ...: + ...: + ...: triangle + ...: 180 + ...: 3.0 + ...: + ...: """ + + In [2]: df = pd.read_xml(xml) + In [3]: df + Out[3]: + shape degrees sides + 0 square 360 4.0 + 1 circle 360 NaN + 2 triangle 180 3.0 + + In [4]: df.to_xml() + Out[4]: + + 0 square 360 4.0 + 1 circle 360 + 2 triangle 180 3.0 - """ - - df = pd.read_xml(xml) + - df.to_xml() +For more, see :ref:`io` in the user guide on IO tools. .. 
_whatsnew_130.enhancements.other: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2b3427943cda0..6737f7151bf03 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2606,7 +2606,7 @@ def to_html( def to_xml( self, - io: Optional[FilePathOrBuffer[str]] = None, + path_or_buffer: Optional[FilePathOrBuffer[str]] = None, index: Optional[bool] = True, root_name: Optional[str] = "data", row_name: Optional[str] = "row", @@ -2628,7 +2628,7 @@ def to_xml( Parameters ---------- - io : str, path object or file-like object, optional + path_or_buffer : str, path object or file-like object, optional File to write output to. If None, the output is returned as a string. index : bool, optional @@ -2760,7 +2760,7 @@ def to_xml( ) return fmt.DataFrameRenderer(formatter).to_xml( - io=io, + path_or_buffer=path_or_buffer, index=index, root_name=root_name, row_name=row_name, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b08b66ba46d61..ebf3cf0852575 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -51,6 +51,7 @@ IndexLabel, StorageOptions, ) +from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -1007,7 +1008,7 @@ def to_html( def to_xml( self, - io: Optional[FilePathOrBuffer[str]] = None, + path_or_buffer: Optional[FilePathOrBuffer[str]] = None, index: Optional[bool] = True, root_name: Optional[str] = "data", row_name: Optional[str] = "row", @@ -1029,7 +1030,7 @@ def to_xml( Parameters ---------- - io : str, path object or file-like object, optional + path_or_buffer : str, path object or file-like object, optional File to write output to. If None, the output is returned as a string. 
index : bool, optional @@ -1084,10 +1085,14 @@ def to_xml( from pandas.io.formats.xml import EtreeXMLFormatter, LxmlXMLFormatter + lxml = import_optional_dependency( + "lxml.etree", raise_on_missing=False, on_version="ignore" + ) + if parser == "lxml": - try: + if lxml is not None: TreeBuilder = LxmlXMLFormatter - except ImportError: + else: warn( "You do not have lxml installed (default parser). " "Instead, etree will be used.", @@ -1103,7 +1108,7 @@ def to_xml( xml_formatter = TreeBuilder( self.fmt, - io=io, + path_or_buffer=path_or_buffer, index=index, root_name=root_name, row_name=row_name, diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 86448c9d4498f..90ee289ad3414 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -1,5 +1,5 @@ """ -Module for formatting output data in XML. +:mod:`pandas.io.formats.xml` is a module for formatting data in XML. """ import codecs @@ -9,6 +9,7 @@ from warnings import warn from pandas._typing import FilePathOrBuffer +from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import is_list_like @@ -16,14 +17,13 @@ from pandas.io.formats.format import DataFrameFormatter -class EtreeXMLFormatter: +class BaseXMLFormatter: """ - Class for formatting data in xml using Python standard library - modules: `xml.etree.ElementTree` and `xml.dom.minidom`. + Subclass for formatting data in XML. Parameters ---------- - io : str or file-like + path_or_buffer : str or file-like This can be either a string of raw XML, a valid URL, file or file-like object. @@ -34,7 +34,7 @@ class EtreeXMLFormatter: Name for root of xml document. Default is 'data'. root_name : str - Name for row elemens of xml document. Default is 'row'. + Name for row elements of xml document. Default is 'row'. na_rep : str Missing data representation. @@ -62,23 +62,19 @@ class EtreeXMLFormatter: Whether to write xml document with line breaks and indentation. 
stylesheet : str or file-like - A URL, file, file-like object, or a raw string containing XSLT, - `etree` does not support XSLT but retained for consistency. + A URL, file, file-like object, or a raw string containing XSLT. See also -------- + pandas.io.formats.xml.EtreeXMLFormatter pandas.io.formats.xml.LxmlXMLFormatter - Notes - ----- - This class serves as fall back option if user does not have - ``lxml`` installed or user specifically requests ``etree`` parser. """ def __init__( self, formatter: DataFrameFormatter, - io: Optional[FilePathOrBuffer[str]] = None, + path_or_buffer: Optional[FilePathOrBuffer[str]] = None, index: Optional[bool] = True, root_name: Optional[str] = "data", row_name: Optional[str] = "row", @@ -93,7 +89,7 @@ def __init__( stylesheet: Optional[FilePathOrBuffer[str]] = None, ) -> None: self.fmt = formatter - self.io = io + self.path_or_buffer = path_or_buffer self.index = index self.root_name = root_name self.row_name = row_name @@ -108,13 +104,6 @@ def __init__( self.stylesheet = stylesheet self.frame = self.fmt.frame - self.validate_columns() - self.validate_encoding() - self.orig_cols = self.fmt.frame.columns.tolist() - self.frame_dicts = self.process_dataframe() - self.handle_indexes() - self.prefix_uri = self.get_prefix_uri() - def build_tree(self) -> bytes: """ Build tree from data. @@ -122,40 +111,7 @@ def build_tree(self) -> bytes: This method initializes the root and builds attributes and elements with optional namespaces. 
""" - from xml.etree.ElementTree import Element, SubElement, tostring - - self.root = Element( - f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() - ) - - for k, d in self.frame_dicts.items(): - self.d = d - self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") - - if self.attr_cols: - self.build_attribs() - if self.elem_cols: - self.build_elems() - if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.frame_dicts[0].keys()) - self.build_elems() - - self.out_xml = tostring(self.root, method="xml", encoding=self.encoding) - - if self.pretty_print: - self.out_xml = self.prettify_tree() - - if not self.xml_declaration: - self.out_xml = self.remove_declaration() - - if self.stylesheet: - warn( - "To use stylesheet, you need lxml installed. " - "The non-transformed, original XML is returned instead.", - UserWarning, - ) - - return self.out_xml + raise AbstractMethodError(self) def validate_columns(self) -> None: """ @@ -233,23 +189,14 @@ def get_prefix_uri(self) -> str: Get uri of namespace prefix. This method retrieves corresponding URI to prefix in namespaces. - """ - from xml.etree.ElementTree import register_namespace - - uri = "" - if self.namespaces: - for p, n in self.namespaces.items(): - register_namespace(p, n) - if self.prefix: - try: - uri = f"{{{self.namespaces[self.prefix]}}}" - except (KeyError): - raise KeyError("prefix is not included in namespaces") - else: - uri = f'{{{self.namespaces[""]}}}' + Raises + ------ + KeyError + *If prefix is not included in namespace dict. + """ - return uri + raise AbstractMethodError(self) def other_namespaces(self) -> dict: """ @@ -277,6 +224,109 @@ def build_attribs(self) -> None: works with tuples for multindex or hierarchical columns. """ + raise AbstractMethodError(self) + + def build_elems(self) -> None: + """ + Create child elements of row. 
+ + This method adds child elements using elem_cols to row element and + works with tuples for multindex or hierarchical columns. + """ + + raise AbstractMethodError(self) + + def write_output(self) -> Optional[str]: + xml_doc = self.build_tree() + + try: + if self.path_or_buffer: + with open(self.path_or_buffer, "wb") as f: + f.write(xml_doc) + xml_doc = None + else: + xml_doc = xml_doc.decode(self.encoding).rstrip() + except (UnicodeDecodeError, OSError) as e: + raise e + + return xml_doc + + +class EtreeXMLFormatter(BaseXMLFormatter): + """ + Class for formatting data in xml using Python standard library + modules: `xml.etree.ElementTree` and `xml.dom.minidom`. + + Notes + ----- + This class serves as fall back option if user does not have + ``lxml`` installed or user specifically requests ``etree`` parser. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.validate_columns() + self.validate_encoding() + self.orig_cols = self.fmt.frame.columns.tolist() + self.frame_dicts = self.process_dataframe() + self.handle_indexes() + self.prefix_uri = self.get_prefix_uri() + + def build_tree(self) -> bytes: + from xml.etree.ElementTree import Element, SubElement, tostring + + self.root = Element( + f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() + ) + + for k, d in self.frame_dicts.items(): + self.d = d + self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") + + if self.attr_cols: + self.build_attribs() + if self.elem_cols: + self.build_elems() + if not self.attr_cols and not self.elem_cols: + self.elem_cols = list(self.frame_dicts[0].keys()) + self.build_elems() + + self.out_xml = tostring(self.root, method="xml", encoding=self.encoding) + + if self.pretty_print: + self.out_xml = self.prettify_tree() + + if not self.xml_declaration: + self.out_xml = self.remove_declaration() + + if self.stylesheet: + warn( + "To use stylesheet, you need lxml installed. 
" + "Instead, the non-transformed, original XML is returned.", + UserWarning, + ) + + return self.out_xml + + def get_prefix_uri(self) -> str: + from xml.etree.ElementTree import register_namespace + + uri = "" + if self.namespaces: + for p, n in self.namespaces.items(): + register_namespace(p, n) + if self.prefix: + try: + uri = f"{{{self.namespaces[self.prefix]}}}" + except (KeyError): + raise KeyError(f"{self.prefix} is not included in namespaces") + else: + uri = f'{{{self.namespaces[""]}}}' + + return uri + + def build_attribs(self) -> None: for col in self.attr_cols: flat_col = col if isinstance(col, tuple): @@ -294,13 +344,6 @@ def build_attribs(self) -> None: raise KeyError(f"no valid column, {col}") def build_elems(self) -> None: - """ - Create child elements of row. - - This method adds child elements using elem_cols to row element and - works with tuples for multindex or hierarchical columns. - """ - from xml.etree.ElementTree import SubElement for col in self.elem_cols: @@ -342,111 +385,20 @@ def remove_declaration(self) -> None: return self.out_xml.split(b"?>")[-1].strip() - def write_output(self) -> Optional[str]: - xml_doc = self.build_tree() - - try: - if self.io: - with open(self.io, "wb") as f: - f.write(xml_doc) - xml_doc = None - else: - xml_doc = xml_doc.decode(self.encoding).rstrip() - except (UnicodeDecodeError, OSError) as e: - raise e - - return xml_doc - -class LxmlXMLFormatter: +class LxmlXMLFormatter(BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. - Parameters - ---------- - io : str or file-like - This can be either a string of raw XML, a valid URL, - file or file-like object. - - index : bool - Whether to include index in xml document. - - row_name : str - Name for root of xml document. Default is 'data'. - - root_name : str - Name for row elemens of xml document. Default is 'row'. - - na_rep : str - Missing data representation. 
- - attrs_cols : list - List of columns to write as attributes in row element. - - elem_cols : list - List of columns to write as children in row element. - - namespacess : dict - The namespaces to define in XML document as dicts with key - being namespace and value the URI. - - prefix : str - The prefix for each element in XML document including root. - - encoding : str - Encoding of xml object or document. - - xml_declaration : bool - Whether to include xml declaration at top line item in xml. - - pretty_print : bool - Whether to write xml document with line breaks and indentation. - - stylesheet : str or file-like - A URL, file, file-like object, or a raw string containing XSLT. - - See also - -------- - pandas.io.formats.xml.EtreeXMLFormatter - Notes ----- This class serves as default option. If user does not have `lxml` installed, `to_xml` will fall back with EtreeXMLFormatter. """ - def __init__( - self, - formatter: DataFrameFormatter, - io: Optional[FilePathOrBuffer[str]] = None, - index: Optional[bool] = True, - root_name: Optional[str] = "data", - row_name: Optional[str] = "row", - na_rep: Optional[str] = None, - attr_cols: Optional[Union[str, List[str]]] = None, - elem_cols: Optional[Union[str, List[str]]] = None, - namespaces: Optional[Dict[str, str]] = None, - prefix: Optional[str] = None, - encoding: Optional[str] = "utf-8", - xml_declaration: Optional[bool] = True, - pretty_print: Optional[bool] = True, - stylesheet: Optional[FilePathOrBuffer[str]] = None, - ) -> None: - self.fmt = formatter - self.io = io - self.index = index - self.root_name = root_name - self.row_name = row_name - self.na_rep = na_rep - self.attr_cols = attr_cols - self.elem_cols = elem_cols - self.namespaces = namespaces - self.prefix = prefix - self.encoding = encoding - self.xml_declaration = xml_declaration - self.pretty_print = pretty_print - self.stylesheet = stylesheet + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.validate_columns() 
self.validate_encoding() @@ -495,62 +447,6 @@ def build_tree(self) -> bytes: return self.out_xml - def validate_columns(self) -> None: - """ - Validate elems_cols and attrs_cols. - - This method will check if columns is list-like. - - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. - """ - if self.attr_cols and not is_list_like(self.attr_cols): - raise TypeError( - f"{type(self.attr_cols).__name__} is not a valid type for attr_cols" - ) - - if self.elem_cols and not is_list_like(self.elem_cols): - raise TypeError( - f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" - ) - - def validate_encoding(self) -> None: - """ - Validate encoding. - - This method will check if encoding is among listed under codecs. - - Raises - ------ - LookupError - * If encoding is not available in codecs. - """ - - try: - codecs.lookup(self.encoding) - except LookupError as e: - raise e - - def process_dataframe(self) -> dict: - """ - Adjust Data Frame to fit xml output. - - This method will adjust underlying data frame for xml output, - including replacing missing entities and including indexes. - """ - - na_dict = {"None": self.na_rep, "NaN": self.na_rep, "nan": self.na_rep} - - df = ( - (self.fmt.frame.reset_index().applymap(str).replace(na_dict)) - if self.index - else self.fmt.frame.applymap(str).replace(na_dict) - ) - - return df.to_dict(orient="index") - def convert_empty_str_key(self) -> None: """ Replace zero-lengh string in `namespaces`. @@ -562,39 +458,14 @@ def convert_empty_str_key(self) -> None: if self.namespaces and "" in self.namespaces.keys(): self.namespaces[None] = self.namespaces.pop("", "default") - def handle_indexes(self) -> None: - """ - Handle indexes. - - This method will add indexes into attr_cols or elem_cols. 
- """ - indexes = [x for x in self.frame_dicts[0].keys() if x not in self.orig_cols] - - if self.attr_cols and self.index: - self.attr_cols = list(indexes) + self.attr_cols - - if self.elem_cols and self.index: - self.elem_cols = list(indexes) + self.elem_cols - def get_prefix_uri(self) -> str: - """ - Get uri of namespace prefix. - - This method retrieves corresponding URI to prefix in namespaces. - - Raises - ------ - ValueError - *If prefix is not included in namespace dict. - """ - uri = "" if self.namespaces: if self.prefix: try: uri = f"{{{self.namespaces[self.prefix]}}}" except (KeyError): - raise KeyError("prefix is not included in namespaces") + raise KeyError(f"{self.prefix} is not included in namespaces") else: uri = f'{{{self.namespaces[""]}}}' @@ -656,25 +527,25 @@ def convert_io(self) -> Union[None, str]: as string, depending on object type. """ - obj = None - if isinstance(self.stylesheet, str): obj = self.stylesheet - if isinstance(self.stylesheet, bytes): + elif isinstance(self.stylesheet, bytes): obj = self.stylesheet.decode(self.encoding) - if isinstance(self.stylesheet, io.StringIO): + elif isinstance(self.stylesheet, io.StringIO): obj = self.stylesheet.getvalue() - if isinstance(self.stylesheet, io.BytesIO): + elif isinstance(self.stylesheet, io.BytesIO): obj = self.stylesheet.getvalue().decode(self.encoding) - if isinstance(self.stylesheet, io.TextIOWrapper): + elif isinstance(self.stylesheet, io.TextIOWrapper): obj = self.stylesheet.read() - if isinstance(self.stylesheet, io.BufferedReader): + elif isinstance(self.stylesheet, io.BufferedReader): obj = self.stylesheet.read().decode(self.encoding) + else: + obj = None return obj @@ -709,7 +580,7 @@ def parse_doc(self): if current_doc: is_xml = current_doc.startswith((" bytes: raise e return bytes(new_doc) - - def write_output(self) -> Optional[str]: - xml_doc = self.build_tree() - - try: - if self.io: - with open(self.io, "wb") as f: - f.write(xml_doc) - xml_doc = None - else: - xml_doc = 
xml_doc.decode(self.encoding).rstrip() - - except (UnicodeDecodeError, OSError) as e: - raise e - - return xml_doc diff --git a/pandas/io/xml.py b/pandas/io/xml.py index af2004c05428c..dd4736176d602 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1,6 +1,5 @@ """ -:mod:`pandas.io.xml` is a module containing functionality for dealing with -XML IO. +:mod:`pandas.io.xml` is a module for reading XML. """ @@ -10,8 +9,8 @@ from warnings import warn from pandas._typing import FilePathOrBuffer -from pandas.errors import ParserError -from pandas.util._decorators import deprecate_nonkeyword_arguments +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError, ParserError from pandas.core.dtypes.common import is_list_like @@ -21,10 +20,9 @@ from pandas.io.parsers import TextParser -class _EtreeFrameParser: +class _XMLFrameParser: """ - Internal class to parse XML into DataFrames with the Python - standard library XML modules: `xml.etree.ElementTree`. + Internal subclass to parse XML into DataFrames. Parameters ---------- @@ -58,15 +56,22 @@ class _EtreeFrameParser: See also -------- + pandas.io.xml._EtreeFrameParser pandas.io.xml._LxmlFrameParser Notes ----- - This class serves as fall back option if user does not have - ``lxml`` installed or user specifically requests ``etree`` parser. - """ + To subclass this class effectively you must override the following methods:` + * :func:`parse_data` + * :func:`_parse_nodes` + * :func:`_parse_doc` + * :func:`_validate_names` + * :func:`_validate_path` - from xml.etree.ElementTree import Element, ElementTree + + See each method's respective documentation for details on their + functionality. 
+ """ def __init__( self, @@ -87,6 +92,7 @@ def __init__( self.names = names self.encoding = encoding self.stylesheet = stylesheet + self.is_style = None def parse_data(self) -> List[Dict[str, List[str]]]: """ @@ -96,19 +102,7 @@ def parse_data(self) -> List[Dict[str, List[str]]]: validate xpath, names, parse and return specific nodes. """ - if self.stylesheet: - warn( - "To use stylesheet, you need lxml installed. " - "Nodes will be parsed on original XML at the xpath.", - UserWarning, - ) - - self.xml_doc = self._parse_doc() - - self._validate_path() - self._validate_names() - - return self._parse_nodes() + raise AbstractMethodError(self) def _parse_nodes(self) -> List[Dict[str, List[str]]]: """ @@ -130,6 +124,131 @@ def _parse_nodes(self) -> List[Dict[str, List[str]]]: will have optional keys filled withi None values. """ + raise AbstractMethodError(self) + + def _validate_path(self) -> None: + """ + Validate xpath. + + This method checks for syntax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpah is not supported or issues with namespaces. + + ValueError + * If xpah does not return any nodes. + """ + + raise AbstractMethodError(self) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list-like and aligns + with length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + raise AbstractMethodError(self) + + def _convert_io(self, xml_data) -> Union[None, str]: + """ + Convert io object to string. + + This method will convert io object into a string or keep + as string, depending on object type. 
+ """ + + if isinstance(xml_data, str): + obj = xml_data + + elif isinstance(xml_data, bytes): + obj = xml_data.decode(self.encoding) + + elif isinstance(xml_data, io.StringIO): + obj = xml_data.getvalue() + + elif isinstance(xml_data, io.BytesIO): + obj = xml_data.getvalue().decode(self.encoding) + + elif isinstance(xml_data, io.TextIOWrapper): + obj = xml_data.read() + + elif isinstance(xml_data, io.BufferedReader): + obj = xml_data.read().decode(self.encoding) + else: + obj = None + + return obj + + def _parse_doc(self): + """ + Build tree from io. + + This method will parse io object into tree for parsing + conditionally by its specific object type. + + Raises + ------ + HttpError + * If URL cannot be reached. + + LookupError + * If xml document has incorrect or unknown encoding. + + OSError + * If file cannot be found. + + ParseError + * If xml document conntains syntax issues. + + ValueError + * If io object is not readable as string or file-like object. + """ + + raise AbstractMethodError(self) + + +class _EtreeFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into DataFrames with the Python + standard library XML modules: `xml.etree.ElementTree`. + + Notes + ----- + This class serves as fall back option if user does not have + ``lxml`` installed or user specifically requests ``etree`` parser. + """ + + from xml.etree.ElementTree import Element, ElementTree + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def parse_data(self) -> List[Dict[str, List[str]]]: + + if self.stylesheet: + warn( + "To use stylesheet, you need lxml installed. 
" + "Nodes will be parsed on original XML at the xpath.", + UserWarning, + ) + + self.xml_doc = self._parse_doc() + + self._validate_path() + self._validate_names() + + return self._parse_nodes() + + def _parse_nodes(self) -> List[Dict[str, List[str]]]: + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) if self.elems_only and self.attrs_only: @@ -215,15 +334,6 @@ def _parse_nodes(self) -> List[Dict[str, List[str]]]: def _validate_path(self) -> None: """ - Validate xpath. - - This method checks for sytnax, evaluation, or empty nodes return. - - Raises - ------ - SyntaxError - * If xpah is not supported or issues with namespaces. - Notes ----- `etree` supports limited XPath. If user attempts a more complex @@ -252,17 +362,6 @@ def _validate_path(self) -> None: ) def _validate_names(self) -> None: - """ - Validate names. - - This method will check if names is a list-like and aligns - with length of parse nodes. - - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. - """ if self.names: children = self.xml_doc.find( self.xpath, namespaces=self.namespaces @@ -278,65 +377,14 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _convert_io(self) -> Union[None, str]: - """ - Convert io object to string. - - This method will convert io object into a string or keep - as string, depending on object type. - """ - - obj = None - - if isinstance(self.io, str): - obj = self.io - - if isinstance(self.io, bytes): - obj = self.io.decode(self.encoding) - - if isinstance(self.io, io.StringIO): - obj = self.io.getvalue() - - if isinstance(self.io, io.BytesIO): - obj = self.io.getvalue().decode(self.encoding) - - if isinstance(self.io, io.TextIOWrapper): - obj = self.io.read() - - if isinstance(self.io, io.BufferedReader): - obj = self.io.read().decode(self.encoding) - - return obj - def _parse_doc(self) -> Union[Element, ElementTree]: - """ - Build tree from io. 
- - This method will parse io object into tree for parsing - conditionally by its specific object type. - - Raises - ------ - HttpError - * If URL cannot be reached. - - OSError - * If file cannot be found. - - ParseError - * If xml document conntains syntax issues. - - ValueError - * If io object is not readable as string or file-like object. - """ - from xml.etree.ElementTree import ParseError, fromstring, parse - current_doc = self._convert_io() + current_doc = self._convert_io(self.io) if current_doc: is_xml = current_doc.startswith((" Union[Element, ElementTree]: return r -class _LxmlFrameParser: +class _LxmlFrameParser(_XMLFrameParser): """ Internal class to parse XML into DataFrames with third-party full-featured XML library, `lxml`, that supports XPath 1.0 and XSLT 1.0. - Parameters - ---------- - io : str or file-like - This can be either a string of raw XML, a valid URL, - file or file-like object. - - xpath : str or regex - The XPath expression to parse required set of nodes for - migration to `Data Frame`. - - namespacess : dict - The namespaces defined in XML document (`xmlns:namespace='URI') - as dicts with key being namespace and value the URI. - - elems_only : bool - Parse only the child elements at the specified `xpath`. - - attrs_only : bool - Parse only the attributes at the specified `xpath`. - - names : list - Column names for Data Frame of parsed XML data. - - encoding : str - Encoding of xml object or document. - - stylesheet : str or file-like - URL, file, file-like object, or a raw string containing XSLT. - - See also - -------- - pandas.io.xml._EtreeFrameParser - Notes ----- This is the default class called with `_EtreeFrameParser` serving @@ -405,28 +420,8 @@ class _LxmlFrameParser: efficiency. 
""" - def __init__( - self, - io, - xpath, - namespaces, - elems_only, - attrs_only, - names, - encoding, - stylesheet, - ): - self.io = io - self.xpath = xpath - self.namespaces = namespaces - self.elems_only = elems_only - self.attrs_only = attrs_only - self.names = names - self.encoding = encoding - self.stylesheet = stylesheet - self.is_style = False - - self.compression = "infer" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def parse_data(self) -> List[Dict[str, List[str]]]: """ @@ -450,24 +445,6 @@ def parse_data(self) -> List[Dict[str, List[str]]]: return self._parse_nodes() def _parse_nodes(self) -> List[Dict[str, List[str]]]: - """ - Parse xml nodes. - - This method will parse the children and attributes of elements - in xpath, conditionally for only elements, only attributes - or both while optionally renaming node names. - - Raises - ------ - ValueError - * If only elements and only attributes are specified. - - Notes - ----- - Namespace URIs will be removed from return node values.Also, - elements with missing children or attributes compared to siblings - will have optional keys filled withi None values. - """ elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) if self.elems_only and self.attrs_only: @@ -570,21 +547,6 @@ def _transform_doc(self): return new_doc def _validate_path(self) -> None: - """ - Validate xpath. - - This method checks for sytnax, evaluation, or empty nodes return. - - Raises - ------ - SyntaxError - * If xpah is not supported or issues with namespaces. - - Notes - ----- - `etree` supports limited XPath. If user attempts a more complex - expression syntax error will raise. - """ from lxml.etree import XPathEvalError, XPathSyntaxError try: @@ -632,70 +594,16 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _convert_io(self) -> Union[None, str]: - """ - Convert filepath_or_buffer object to string. 
- - This method will convert io object into a string or keep - as string, depending on object type. - """ - - obj = None - - if isinstance(self.raw_doc, str): - obj = self.raw_doc - - if isinstance(self.raw_doc, bytes): - obj = self.raw_doc.decode(self.encoding) - - if isinstance(self.raw_doc, io.StringIO): - obj = self.raw_doc.getvalue() - - if isinstance(self.raw_doc, io.BytesIO): - obj = self.raw_doc.getvalue().decode(self.encoding) - - if isinstance(self.raw_doc, io.TextIOWrapper): - obj = self.raw_doc.read() - - if isinstance(self.raw_doc, io.BufferedReader): - obj = self.raw_doc.read().decode(self.encoding) - - return obj - def _parse_doc(self): - """ - Build tree from io. - - This method will parse io object into tree for parsing - conditionally by its specific object type. - - Raises - ------ - HttpError - * If URL cannot be reached. - - LookupError - * If xml document has incorrect or unknown encoding. - - OSError - * If file cannot be found. - - XMLSyntaxError - * If xml document conntains syntax issues. - - ValueError - * If io object is not readable as string or file-like object. - """ - from lxml.etree import XML, XMLParser, XMLSyntaxError, parse self.raw_doc = self.stylesheet if self.is_style else self.io - current_doc = self._convert_io() + current_doc = self._convert_io(self.raw_doc) if current_doc: is_xml = current_doc.startswith((" DataFrame: r""" - Read XML docuemnts into a ``DataFrame`` object. + Read XML document into a ``DataFrame`` object. .. 
versionadded:: 1.3.0 diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 62958894981fd..5234f25399fef 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1,4 +1,6 @@ from io import BytesIO, StringIO +import os +import sys import numpy as np import pytest @@ -10,6 +12,44 @@ from pandas.io.xml import read_xml +""" +CHECKLIST + +etree +[X] - TypeError("...is not a valid type for attr_cols") +[X] - TypeError("...is not a valid type for elem_cols") +[X] - LookupError("unknown encoding") +[X] - KeyError("...is not included in namespaces") +[X] - KeyError("no valid column") +[X] - UserWarning("To use stylesheet, you need lxml installed.") +[X] - ImportWarning("You do not have lxml installed.") + +lxml +[X] - TypeError("...is not a valid type for attr_cols") +[X] - TypeError("...is not a valid type for elem_cols") +[X] - LookupError("unknown encoding") +[] - UnicodeDecodeError (NEED TO NON UTF-8 STYLESHEET) +[] - OSError (NEED UNREACHABLE FILE PATH) +[X] - KeyError("...is not included in namespaces") +[X] - KeyError("no valid column") +[X] - ValueError("stylesheet is not a url, file, or xml string.") +[] - LookupError +[] - URLError (GENERAL ERROR USUALLY DUE TO NETWORKING) +[] - HTTPError (NEED TO ONLINE STYLESHEET) +[X] - OSError("failed to load external entity") +[X] - XMLSyntaxError("Opening and ending tag mismatch") +[X] - XSLTApplyError("Cannot resolve URI") +[X] - XSLTParseError("failed to compile") +""" + +etree_attr_skip_param = pytest.param( + "etree", + marks=pytest.mark.skipif( + sys.version_info <= (3, 7), + reason=("etree alpha ordered attributes <= py3.7"), + ), +) + geom_df = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -93,7 +133,28 @@ """ -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.fixture(params=["rb", "r"]) +def mode(request): + return request.param + + +@pytest.fixture(params=["lxml", "etree"]) +def parser(request): + return 
request.param + + +# FAIL SAFE WARNING + + +@td.skip_if_installed("lxml") +def test_failsafe_parser(datapath): + with pytest.warns(ImportWarning, match=("You do not have lxml installed.")): + geom_df.to_xml() + + +# FILE OUTPUT + + def test_file_output_str_read(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) @@ -112,7 +173,6 @@ def test_file_output_str_read(datapath, parser): assert output == from_file_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_file_output_bytes_read(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) @@ -131,7 +191,6 @@ def test_file_output_bytes_read(datapath, parser): assert output == from_file_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_str_output(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) @@ -147,7 +206,9 @@ def test_str_output(datapath, parser): assert output == from_file_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# INDEX + + def test_index_false(datapath, parser): expected = """\ @@ -192,7 +253,6 @@ def test_index_false(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_index_false_rename_row_root(datapath, parser): expected = """\ @@ -239,6 +299,8 @@ def test_index_false_rename_row_root(datapath, parser): assert output == expected +# NA_REP + na_expected = """\ @@ -263,7 +325,6 @@ def test_index_false_rename_row_root(datapath, parser): """ -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_na_elem_output(datapath, parser): output = geom_df.to_xml(parser=parser) @@ -276,7 +337,6 @@ def test_na_elem_output(datapath, parser): assert output == na_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_na_empty_str_elem_option(datapath, parser): output = 
geom_df.to_xml(na_rep="", parser=parser) @@ -289,7 +349,6 @@ def test_na_empty_str_elem_option(datapath, parser): assert output == na_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_na_empty_elem_option(datapath, parser): expected = """\ @@ -325,7 +384,10 @@ def test_na_empty_elem_option(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# ATTR_COLS + + +@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) def test_attrs_cols_nan_output(datapath, parser): expected = """\ @@ -346,14 +408,17 @@ def test_attrs_cols_nan_output(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) def test_attrs_cols_prefix(datapath, parser): expected = """\ - - - + + + """ output = geom_df.to_xml( @@ -372,19 +437,19 @@ def test_attrs_cols_prefix(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_attrs_unknown_column(parser): with pytest.raises(KeyError, match=("no valid column")): geom_df.to_xml(attr_cols=["shape", "degreees", "sides"], parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_attrs_wrong_type(parser): with pytest.raises(TypeError, match=("is not a valid type for attr_cols")): geom_df.to_xml(attr_cols='"shape", "degreees", "sides"', parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# ELEM_COLS + + def test_elems_cols_nan_output(datapath, parser): elems_cols_expected = """\ @@ -419,19 +484,16 @@ def test_elems_cols_nan_output(datapath, parser): assert output == elems_cols_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_elems_unknown_column(parser): with pytest.raises(KeyError, match=("no valid column")): geom_df.to_xml(elem_cols=["shape", "degreees", "sides"], parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def 
test_elems_wrong_type(parser): with pytest.raises(TypeError, match=("is not a valid type for elem_cols")): geom_df.to_xml(elem_cols='"shape", "degreees", "sides"', parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_elems_and_attrs_cols(datapath, parser): elems_cols_expected = """\ @@ -466,7 +528,9 @@ def test_elems_and_attrs_cols(datapath, parser): assert output == elems_cols_expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# HIERARCHICAL COLUMNS + + def test_hierarchical_columns(datapath, parser): expected = """\ @@ -475,29 +539,29 @@ def test_hierarchical_columns(datapath, parser): inner terrestrial 4 - 11.811666 - 2.9529165 + 11.81 + 2.95 outer gas giant 2 - 2466.5044 - 1233.2522 + 2466.5 + 1233.25 outer ice giant 2 - 189.2253 - 94.61265 + 189.23 + 94.61 All 8 - 2667.541366 - 333.44267075 + 2667.54 + 333.44 """ @@ -506,7 +570,7 @@ def test_hierarchical_columns(datapath, parser): values="mass", aggfunc=["count", "sum", "mean"], margins=True, - ) + ).round(2) output = pvt.to_xml(parser=parser) @@ -519,19 +583,19 @@ def test_hierarchical_columns(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) def test_hierarchical_attrs_columns(datapath, parser): expected = """\ +sum_mass="11.81" mean_mass="2.95"/> +sum_mass="2466.5" mean_mass="1233.25"/> +sum_mass="189.23" mean_mass="94.61"/> +sum_mass="2667.54" mean_mass="333.44"/> """ pvt = planet_df.pivot_table( @@ -539,7 +603,7 @@ def test_hierarchical_attrs_columns(datapath, parser): values="mass", aggfunc=["count", "sum", "mean"], margins=True, - ) + ).round(2) output = pvt.to_xml(attr_cols=list(pvt.reset_index().columns.values), parser=parser) @@ -552,7 +616,9 @@ def test_hierarchical_attrs_columns(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# MULTIINDEX + + def test_multi_index(datapath, parser): expected 
= """\ @@ -561,26 +627,30 @@ def test_multi_index(datapath, parser): inner terrestrial 4 - 11.811666 - 2.9529165 + 11.81 + 2.95 outer gas giant 2 - 2466.5044 - 1233.2522 + 2466.5 + 1233.25 outer ice giant 2 - 189.2253 - 94.61265 + 189.23 + 94.61 """ - agg = planet_df.groupby(["location", "type"])["mass"].agg(["count", "sum", "mean"]) + agg = ( + planet_df.groupby(["location", "type"])["mass"] + .agg(["count", "sum", "mean"]) + .round(2) + ) output = agg.to_xml(parser=parser) @@ -593,18 +663,24 @@ def test_multi_index(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) def test_multi_index_attrs_cols(datapath, parser): expected = """\ - - - + + + """ - agg = planet_df.groupby(["location", "type"])["mass"].agg(["count", "sum", "mean"]) - + agg = ( + planet_df.groupby(["location", "type"])["mass"] + .agg(["count", "sum", "mean"]) + .round(2) + ) output = agg.to_xml(attr_cols=list(agg.reset_index().columns.values), parser=parser) # etree and lxml differs on quotes and case in xml declaration @@ -616,7 +692,9 @@ def test_multi_index_attrs_cols(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# NAMESPACE + + def test_default_namespace(parser): expected = """\ @@ -652,7 +730,9 @@ def test_default_namespace(parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# PREFIX + + def test_namespace_prefix(parser): expected = """\ @@ -690,16 +770,14 @@ def test_namespace_prefix(parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_missing_prefix_in_nmsp(parser): - with pytest.raises(KeyError, match=("prefix is not included in namespaces")): + with pytest.raises(KeyError, match=("doc is not included in namespaces")): geom_df.to_xml( namespaces={"": "http://example.com"}, prefix="doc", parser=parser ) -@pytest.mark.parametrize("parser", 
["lxml", "etree"]) def test_namespace_prefix_and_default(parser): expected = """\ @@ -745,6 +823,8 @@ def test_namespace_prefix_and_default(parser): assert output == expected +# ENCODING + encoding_expected = """\ @@ -781,7 +861,6 @@ def test_namespace_prefix_and_default(parser): """ -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_encoding_option_str(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") df_file = read_xml(filename, parser=parser, encoding="ISO-8859-1").head(5) @@ -806,7 +885,6 @@ def test_correct_encoding_file(datapath): df_file.to_xml(path, index=False, encoding="ISO-8859-1") -@pytest.mark.parametrize("parser", ["lxml", "etree"]) @pytest.mark.parametrize("encoding", ["UTF-8", "UTF-16", "ISO-8859-1"]) def test_wrong_encoding_option_lxml(datapath, parser, encoding): filename = datapath("io", "data", "xml", "baby_names.xml") @@ -816,13 +894,14 @@ def test_wrong_encoding_option_lxml(datapath, parser, encoding): df_file.to_xml(path, index=False, encoding=encoding, parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_misspelled_encoding(parser): with pytest.raises(LookupError, match=("unknown encoding")): geom_df.to_xml(parser=parser, encoding="uft-8") -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# PRETTY PRINT + + def test_xml_declaration_pretty_print(parser): expected = """\ @@ -851,7 +930,6 @@ def test_xml_declaration_pretty_print(parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_no_pretty_print_with_decl(parser): expected = ( "\n" @@ -873,7 +951,6 @@ def test_no_pretty_print_with_decl(parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_no_pretty_print_no_decl(parser): expected = ( "0square" @@ -889,6 +966,8 @@ def test_no_pretty_print_no_decl(parser): assert output == expected +# STYLESHEET + xsl_expected = """\ @@ -914,7 +993,6 @@ def test_no_pretty_print_no_decl(parser): 
@td.skip_if_no("lxml") -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_stylesheet_file_like(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") @@ -923,7 +1001,6 @@ def test_stylesheet_file_like(datapath, mode): @td.skip_if_no("lxml") -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_stylesheet_io(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") @@ -938,7 +1015,6 @@ def test_stylesheet_io(datapath, mode): @td.skip_if_no("lxml") -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_stylesheet_buffered_reader(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") @@ -950,6 +1026,119 @@ def test_stylesheet_buffered_reader(datapath, mode): assert output == xsl_expected +def test_stylesheet_with_etree(datapath): + xsl = datapath("io", "data", "xml", "row_field_output.xsl") + + with pytest.warns( + UserWarning, match=("To use stylesheet, you need lxml installed.") + ): + geom_df.to_xml(parser="etree", stylesheet=xsl) + + +@td.skip_if_installed("lxml") +def test_stylesheet_without_lxml(datapath, parser): + xsl = datapath("io", "data", "xml", "row_field_output.xslt") + + with pytest.warns( + UserWarning, match=("To use stylesheet, you need lxml installed.") + ): + geom_df.to_xml(stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_stylesheet_wrong_path(datapath, parser): + xsl = os.path.join("data", "xml", "row_field_output.xslt") + + with pytest.raises(OSError, match=("failed to load external entity")): + geom_df.to_xml(stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_stylesheet_not_path_buffer(parser): + with pytest.raises( + ValueError, match=("stylesheet is not a url, file, or xml string") + ): + geom_df.to_xml(stylesheet=DataFrame) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_syntax(): + from lxml.etree import XMLSyntaxError + + xsl = """\ + + + + + + + + + + + + + + + + + + +""" + + with pytest.raises(XMLSyntaxError, match=("Opening and ending tag mismatch")): + 
geom_df.to_xml(stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_eval(): + from lxml.etree import XSLTParseError + + xsl = """\ + + + + + + + + + + + + + + + + + + +""" + + with pytest.raises(XSLTParseError, match=("failed to compile")): + geom_df.to_xml(stylesheet=xsl) + + +def test_incorrect_xsl_apply(parser): + from lxml.etree import XSLTApplyError + + xsl = """\ + + + + + + + + + +""" + + with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")): + with tm.ensure_clean("test.xml") as path: + geom_df.to_xml(path, stylesheet=xsl) + + @td.skip_if_no("lxml") def test_style_to_csv(): xsl = """\ @@ -970,7 +1159,7 @@ def test_style_to_csv(): """ - out_csv = geom_df.to_csv().strip() + out_csv = geom_df.to_csv(line_terminator="\n").strip() out_xml = geom_df.to_xml(stylesheet=xsl) assert out_csv == out_xml @@ -1053,47 +1242,3 @@ def test_style_to_json(): out_xml = geom_df.to_xml(stylesheet=xsl) assert out_json == out_xml - - -@pytest.mark.skip( - reason="incorrect tag in from to_html() to be skipped until fix" -) -def test_style_to_html(): - xsl = """\ - - - - - - - - - - - - - - - - - -
shapedegreessides
-
- - - - - - - - - - - - -
""" - - out_html = geom_df.to_html() - out_xml = geom_df.to_xml(stylesheet=xsl) - - assert out_html == out_xml diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 375e9c2472742..23eb128a30379 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -11,6 +11,43 @@ from pandas.io.xml import read_xml +""" +CHECK LIST + +etree +[X] - ValueError("Either element or attributes can be parsed not both.") +[X] - ValueError("xpath does not return any nodes...") +[X] - SyntaxError("You have used an incorrect or unsupported XPath") +[X] - ValueError("names does not match length of child elements in xpath.") +[X] - TypeError("...is not a valid type for names") +[X] - ValueError("io is not a url, file, or xml string") +[] - URLError (GENERAL ERROR USUALLY DUE TO NETWORKING) +[X] - HTTPError("HTTP Error 404: Not Found") +[X] - OSError("No such file") +[] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) +[X] - ImportWarning("You do not have lxml installed.") + +lxml +[X] - ValueError("Either element or attributes can be parsed not both.") +[X] - XSLTApplyError("Cannot resolve URI") +[X] - XSLTParseError("document is not a stylesheet") +[X] - ValueError("xpath does not return any nodes.") +[X] - XPathEvalError("Invalid expression") +[] - XPathSyntaxError (OLD VERSION IN lxml FOR XPATH ERRORS) +[X] - TypeError("empty namespace prefix is not supported in XPath") +[X] - ValueError("names does not match length of child elements in xpath.") +[X] - TypeError("...is not a valid type for names") +[X] - ValueError("io is not a url, file, or xml string") +[X] - LookupError(unknown encoding) +[] - URLError (GENERAL ERROR USUALLY DUE TO NETWORKING) +[X - HTTPError("HTTP Error 404: Not Found") +[X] - OSError("failed to load external entity") +[X] - XMLSyntaxError("Start tag expected, '<' not found") +[] - ParserError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) +[X] - ValueError("Values for parser can only be lxml or etree.") +""" + + 
xml_default_nmsp = """\ @@ -52,6 +89,30 @@ """ +@pytest.fixture(params=["rb", "r"]) +def mode(request): + return request.param + + +@pytest.fixture(params=["lxml", "etree"]) +def parser(request): + return request.param + + +# FAIL SAFE WARNING + + +@td.skip_if_installed("lxml") +def test_failsafe_parser(datapath): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.warns(ImportWarning, match=("You do not have lxml installed.")): + read_xml(filename) + + +# FILE / URL + + def test_parser_consistency_file(datapath): filename = datapath("io", "data", "xml", "books.xml") df_file_lxml = read_xml(filename, parser="lxml") @@ -67,14 +128,12 @@ def test_parser_consistency_url(datapath): "https://data.cityofchicago.org/api/views/" "8pix-ypme/rows.xml?accessType=DOWNLOAD" ) - df_file_lxml = read_xml(url, xpath=".//row/row", parser="lxml") - df_file_etree = read_xml(url, xpath=".//row/row", parser="etree") + df_url_lxml = read_xml(url, xpath=".//row/row", parser="lxml") + df_url_etree = read_xml(url, xpath=".//row/row", parser="etree") - tm.assert_frame_equal(df_file_lxml, df_file_etree) + tm.assert_frame_equal(df_url_lxml, df_url_etree) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_file_like(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: @@ -93,8 +152,6 @@ def test_file_like(datapath, parser, mode): tm.assert_frame_equal(df_file, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_file_io(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: @@ -118,8 +175,6 @@ def test_file_io(datapath, parser, mode): tm.assert_frame_equal(df_io, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_file_buffered_reader_string(datapath, parser, 
mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: @@ -140,8 +195,6 @@ def test_file_buffered_reader_string(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: @@ -163,6 +216,11 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) +def test_not_io_object(parser): + with pytest.raises(ValueError, match=("io is not a url, file, or xml string")): + read_xml(DataFrame, parser="lxml") + + def test_wrong_file_lxml(datapath): with pytest.raises(OSError, match=("failed to load external entity")): filename = os.path.join("data", "html", "books.xml") @@ -195,14 +253,16 @@ def test_url(): tm.assert_frame_equal(df_url, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_wrong_url(parser): with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): url = "https://www.w3schools.com/xml/python.xml" read_xml(url, xpath=".//book[count(*)=4]", parser=parser) -def test_empty_xpath_lxml(datapath): +# XPATH + + +def test_empty_xpath_lxml(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(ValueError, match=("xpath does not return any nodes")): read_xml(filename, xpath=".//python", parser="lxml") @@ -225,7 +285,9 @@ def test_bad_xpath_lxml(datapath): read_xml(filename, xpath=".//[book]", parser="lxml") -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# NAMESPACE + + def test_default_namespace(parser): df_nmsp = read_xml( xml_default_nmsp, @@ -245,7 +307,6 @@ def test_default_namespace(parser): tm.assert_frame_equal(df_nmsp, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_prefix_namespace(parser): 
df_nmsp = read_xml( xml_prefix_nmsp, @@ -301,7 +362,9 @@ def test_consistency_prefix_namespace(): tm.assert_frame_equal(df_lxml, df_etree) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# PREFIX + + def test_missing_prefix_with_default_namespace(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(ValueError, match=("xpath does not return any nodes")): @@ -337,7 +400,9 @@ def test_none_namespace_prefix(key): ) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +# ELEMS AND ATTRS + + def test_file_elems_and_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) @@ -354,7 +419,6 @@ def test_file_elems_and_attrs(datapath, parser): tm.assert_frame_equal(df_file, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_file_only_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, attrs_only=True, parser=parser) @@ -363,7 +427,6 @@ def test_file_only_attrs(datapath, parser): tm.assert_frame_equal(df_file, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_file_only_elems(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, elems_only=True, parser=parser) @@ -379,7 +442,6 @@ def test_file_only_elems(datapath, parser): tm.assert_frame_equal(df_file, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_elem_and_attrs_only(datapath, parser): filename = datapath("io", "data", "xml", "cta_rail_lines.kml") with pytest.raises( @@ -389,7 +451,36 @@ def test_elem_and_attrs_only(datapath, parser): read_xml(filename, elems_only=True, attrs_only=True, parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) +def test_attribute_centric_xml(): + xml = """\ + + + + + + + + + + + + + + + + + +""" + + df_lxml = read_xml(xml, xpath=".//station") + df_etree = read_xml(xml, 
xpath=".//station", parser="etree") + + tm.assert_frame_equal(df_lxml, df_etree) + + +# NAMES + + def test_names_option_output(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml( @@ -409,7 +500,6 @@ def test_names_option_output(datapath, parser): tm.assert_frame_equal(df_file, df_expected) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_names_option_wrong_length(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") @@ -417,7 +507,6 @@ def test_names_option_wrong_length(datapath, parser): read_xml(filename, names=["Col1", "Col2", "Col3"], parser=parser) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_names_option_wrong_type(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") @@ -425,6 +514,9 @@ def test_names_option_wrong_type(datapath, parser): read_xml(filename, names="Col1, Col2, Col3", parser=parser) +# ENCODING + + @td.skip_if_no("lxml") def test_wrong_encoding_lxml(datapath): from lxml.etree import XMLSyntaxError @@ -457,7 +549,6 @@ def test_wrong_encoding_etree(datapath, encoding): read_xml(filename, parser="etree", encoding=encoding) -@pytest.mark.parametrize("parser", ["lxml", "etree"]) def test_ascii_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") read_xml(filename, encoding="ascii", parser=parser) @@ -471,31 +562,7 @@ def test_parser_consistency_with_encoding(datapath): tm.assert_frame_equal(df_lxml, df_etree) -def test_attribute_centric_xml(): - xml = """\ - - - - - - - - - - - - - - - - - -""" - - df_lxml = read_xml(xml, xpath=".//station") - df_etree = read_xml(xml, xpath=".//station", parser="etree") - - tm.assert_frame_equal(df_lxml, df_etree) +# PARSER def test_wrong_parser(datapath): @@ -507,6 +574,9 @@ def test_wrong_parser(datapath): read_xml(filename, parser="bs4") +# STYLESHEET + + @td.skip_if_no("lxml") def test_stylesheet_file(datapath): kml = datapath("io", "data", "xml", 
"cta_rail_lines.kml") @@ -630,7 +700,6 @@ def test_stylesheet_file(datapath): @td.skip_if_no("lxml") -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_stylesheet_file_like(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") @@ -640,7 +709,6 @@ def test_stylesheet_file_like(datapath, mode): @td.skip_if_no("lxml") -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_stylesheet_io(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") @@ -653,7 +721,6 @@ def test_stylesheet_io(datapath, mode): @td.skip_if_no("lxml") -@pytest.mark.parametrize("mode", ["rb", "r"]) def test_stylesheet_buffered_reader(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") @@ -664,8 +731,102 @@ def test_stylesheet_buffered_reader(datapath, mode): read_xml(kml, stylesheet=xsl_obj) +def test_not_stylesheet(datapath): + from lxml.etree import XSLTParseError + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "books.xml") + + with pytest.raises(XSLTParseError, match=("document is not a stylesheet")): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_syntax(datapath): + from lxml.etree import XMLSyntaxError + + xsl = """\ + + + + + + + + + + + + + + + +""" + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with pytest.raises( + XMLSyntaxError, match=("Extra content at the end of the document") + ): + read_xml(kml, stylesheet=xsl) + + +@td.skip_if_no("lxml") +def test_incorrect_xsl_eval(datapath): + from lxml.etree import XSLTParseError + + xsl = """\ + + + + + + + + + + + + + + + +""" + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with pytest.raises(XSLTParseError, match=("failed to compile")): + read_xml(kml, stylesheet=xsl) + + 
+@td.skip_if_no("lxml") +def test_incorrect_xsl_apply(datapath): + from lxml.etree import XSLTApplyError + + xsl = """\ + + + + + + + + + +""" + + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")): + read_xml(kml, stylesheet=xsl) + + @td.skip_if_no("lxml") -def test_wrong_stylesheet(datapath): +def test_wrong_stylesheet(): kml = os.path.join("data", "xml", "cta_rail_lines.kml") xsl = os.path.join("data", "xml", "flatten.xsl") From fadcb679121cbfd065bd22d50495f813e035efa1 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 2 Feb 2021 23:17:06 -0600 Subject: [PATCH 03/35] Fixed import_optional_dependency() args --- pandas/io/formats/format.py | 4 +--- pandas/io/xml.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ebf3cf0852575..870e2a5976319 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1085,9 +1085,7 @@ def to_xml( from pandas.io.formats.xml import EtreeXMLFormatter, LxmlXMLFormatter - lxml = import_optional_dependency( - "lxml.etree", raise_on_missing=False, on_version="ignore" - ) + lxml = import_optional_dependency("lxml.etree", errors="ignore") if parser == "lxml": if lxml is not None: diff --git a/pandas/io/xml.py b/pandas/io/xml.py index dd4736176d602..0302c5f287e94 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -674,9 +674,7 @@ def _parse( fallback option with etree parser. 
""" - lxml = import_optional_dependency( - "lxml.etree", raise_on_missing=False, on_version="ignore" - ) + lxml = import_optional_dependency("lxml.etree", errors="ignore") if parser == "lxml": if lxml is not None: From ac5fd3a861b4032c8190ad733b7063e3b6664249 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 3 Feb 2021 01:05:10 -0600 Subject: [PATCH 04/35] Fix fixture and param name collision and check two errors in tests --- pandas/tests/io/formats/test_to_xml.py | 23 +++++++++++++---------- pandas/tests/io/test_xml.py | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 5234f25399fef..412c07dfd7960 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -387,8 +387,8 @@ def test_na_empty_elem_option(datapath, parser): # ATTR_COLS -@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) -def test_attrs_cols_nan_output(datapath, parser): +@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) +def test_attrs_cols_nan_output(datapath, attrs_parser): expected = """\ @@ -408,8 +408,8 @@ def test_attrs_cols_nan_output(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) -def test_attrs_cols_prefix(datapath, parser): +@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) +def test_attrs_cols_prefix(datapath, attrs_parser): expected = """\ @@ -583,8 +583,8 @@ def test_hierarchical_columns(datapath, parser): assert output == expected -@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) -def test_hierarchical_attrs_columns(datapath, parser): +@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) +def test_hierarchical_attrs_columns(datapath, attrs_parser): expected = """\ @@ -663,8 +663,8 @@ def test_multi_index(datapath, parser): assert output == expected 
-@pytest.mark.parametrize("parser", ["lxml", etree_attr_skip_param]) -def test_multi_index_attrs_cols(datapath, parser): +@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) +def test_multi_index_attrs_cols(datapath, attrs_parser): expected = """\ @@ -1037,7 +1037,7 @@ def test_stylesheet_with_etree(datapath): @td.skip_if_installed("lxml") def test_stylesheet_without_lxml(datapath, parser): - xsl = datapath("io", "data", "xml", "row_field_output.xslt") + xsl = datapath("io", "data", "xml", "row_field_output.xsl") with pytest.warns( UserWarning, match=("To use stylesheet, you need lxml installed.") @@ -1049,7 +1049,10 @@ def test_stylesheet_without_lxml(datapath, parser): def test_stylesheet_wrong_path(datapath, parser): xsl = os.path.join("data", "xml", "row_field_output.xslt") - with pytest.raises(OSError, match=("failed to load external entity")): + with pytest.raises( + (OSError, FileNotFoundError), + match=("failed to load external entity|No such file or directory"), + ): geom_df.to_xml(stylesheet=xsl) diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 23eb128a30379..53926765f00d8 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -113,6 +113,7 @@ def test_failsafe_parser(datapath): # FILE / URL +@td.skip_if_no("lxml") def test_parser_consistency_file(datapath): filename = datapath("io", "data", "xml", "books.xml") df_file_lxml = read_xml(filename, parser="lxml") @@ -123,6 +124,7 @@ def test_parser_consistency_file(datapath): @tm.network @pytest.mark.slow +@td.skip_if_no("lxml") def test_parser_consistency_url(datapath): url = ( "https://data.cityofchicago.org/api/views/" @@ -216,13 +218,18 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) +@td.skip_if_no("lxml") def test_not_io_object(parser): with pytest.raises(ValueError, match=("io is not a url, file, or xml string")): read_xml(DataFrame, parser="lxml") 
+@td.skip_if_no("lxml") def test_wrong_file_lxml(datapath): - with pytest.raises(OSError, match=("failed to load external entity")): + with pytest.raises( + (OSError, FileNotFoundError), + match=("failed to load external entity|No such file or directory"), + ): filename = os.path.join("data", "html", "books.xml") read_xml(filename, parser="lxml") @@ -731,6 +738,7 @@ def test_stylesheet_buffered_reader(datapath, mode): read_xml(kml, stylesheet=xsl_obj) +@td.skip_if_no("lxml") def test_not_stylesheet(datapath): from lxml.etree import XSLTParseError @@ -830,7 +838,10 @@ def test_wrong_stylesheet(): kml = os.path.join("data", "xml", "cta_rail_lines.kml") xsl = os.path.join("data", "xml", "flatten.xsl") - with pytest.raises(OSError, match=("failed to load external entity")): + with pytest.raises( + (OSError, FileNotFoundError), + match=("failed to load external entity|No such file or directory"), + ): read_xml(kml, stylesheet=xsl) From 938b0a091c8e4d4fe65ce9099f3c208a5798215c Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 3 Feb 2021 09:31:32 -0600 Subject: [PATCH 05/35] Adjusted tests to handle etree version issues --- pandas/tests/io/formats/test_to_xml.py | 57 ++++++++++++++------------ pandas/tests/io/test_xml.py | 10 ++--- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 412c07dfd7960..4aadbdb4b32e4 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -28,28 +28,20 @@ [X] - TypeError("...is not a valid type for attr_cols") [X] - TypeError("...is not a valid type for elem_cols") [X] - LookupError("unknown encoding") -[] - UnicodeDecodeError (NEED TO NON UTF-8 STYLESHEET) -[] - OSError (NEED UNREACHABLE FILE PATH) +[] - UnicodeDecodeError (NEED NON-UTF-8 STYLESHEET) +[] - OSError (NEED UNREACHABLE LOCAL FILE PATH) [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") [X] - 
ValueError("stylesheet is not a url, file, or xml string.") [] - LookupError -[] - URLError (GENERAL ERROR USUALLY DUE TO NETWORKING) -[] - HTTPError (NEED TO ONLINE STYLESHEET) +[] - URLError (USUALLY DUE TO NETWORKING) +[] - HTTPError (NEED AN ONLINE STYLESHEET) [X] - OSError("failed to load external entity") [X] - XMLSyntaxError("Opening and ending tag mismatch") [X] - XSLTApplyError("Cannot resolve URI") [X] - XSLTParseError("failed to compile") """ -etree_attr_skip_param = pytest.param( - "etree", - marks=pytest.mark.skipif( - sys.version_info <= (3, 7), - reason=("etree alpha ordered attributes <= py3.7"), - ), -) - geom_df = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -387,8 +379,11 @@ def test_na_empty_elem_option(datapath, parser): # ATTR_COLS -@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) -def test_attrs_cols_nan_output(datapath, attrs_parser): +@pytest.mark.skipif( + sys.version_info <= (3, 7), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_attrs_cols_nan_output(datapath, parser): expected = """\ @@ -408,8 +403,11 @@ def test_attrs_cols_nan_output(datapath, attrs_parser): assert output == expected -@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) -def test_attrs_cols_prefix(datapath, attrs_parser): +@pytest.mark.skipif( + sys.version_info <= (3, 7), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_attrs_cols_prefix(datapath, parser): expected = """\ @@ -583,8 +581,11 @@ def test_hierarchical_columns(datapath, parser): assert output == expected -@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) -def test_hierarchical_attrs_columns(datapath, attrs_parser): +@pytest.mark.skipif( + sys.version_info <= (3, 7), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_hierarchical_attrs_columns(datapath, parser): expected = """\ @@ -663,8 +664,11 @@ def test_multi_index(datapath, parser): assert output == expected 
-@pytest.mark.parametrize("attrs_parser", ["lxml", etree_attr_skip_param]) -def test_multi_index_attrs_cols(datapath, attrs_parser): +@pytest.mark.skipif( + sys.version_info <= (3, 7), + reason=("etree alpha ordered attributes <= py3.7"), +) +def test_multi_index_attrs_cols(datapath, parser): expected = """\ @@ -941,12 +945,13 @@ def test_no_pretty_print_with_decl(parser): "
" ) - output = geom_df.to_xml(pretty_print=False) + output = geom_df.to_xml(pretty_print=False, parser=parser) output = output.replace( '", "/>") assert output == expected @@ -961,7 +966,7 @@ def test_no_pretty_print_no_decl(parser): "
" ) - output = geom_df.to_xml(xml_declaration=False, pretty_print=False) + output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser) assert output == expected @@ -1036,7 +1041,7 @@ def test_stylesheet_with_etree(datapath): @td.skip_if_installed("lxml") -def test_stylesheet_without_lxml(datapath, parser): +def test_stylesheet_without_lxml(datapath): xsl = datapath("io", "data", "xml", "row_field_output.xsl") with pytest.warns( @@ -1046,18 +1051,18 @@ def test_stylesheet_without_lxml(datapath, parser): @td.skip_if_no("lxml") -def test_stylesheet_wrong_path(datapath, parser): +def test_stylesheet_wrong_path(datapath): xsl = os.path.join("data", "xml", "row_field_output.xslt") with pytest.raises( (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory"), + match=("failed to load external entity|No such file or directory|没有那个文件或目录"), ): geom_df.to_xml(stylesheet=xsl) @td.skip_if_no("lxml") -def test_stylesheet_not_path_buffer(parser): +def test_stylesheet_not_path_buffer(): with pytest.raises( ValueError, match=("stylesheet is not a url, file, or xml string") ): diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 53926765f00d8..36eb4e2464209 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -21,7 +21,7 @@ [X] - ValueError("names does not match length of child elements in xpath.") [X] - TypeError("...is not a valid type for names") [X] - ValueError("io is not a url, file, or xml string") -[] - URLError (GENERAL ERROR USUALLY DUE TO NETWORKING) +[] - URLError (USUALLY DUE TO NETWORKING) [X] - HTTPError("HTTP Error 404: Not Found") [X] - OSError("No such file") [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) @@ -39,11 +39,11 @@ [X] - TypeError("...is not a valid type for names") [X] - ValueError("io is not a url, file, or xml string") [X] - LookupError(unknown encoding) -[] - URLError (GENERAL ERROR USUALLY DUE TO NETWORKING) +[] - URLError 
(USUALLY DUE TO NETWORKING) [X - HTTPError("HTTP Error 404: Not Found") [X] - OSError("failed to load external entity") [X] - XMLSyntaxError("Start tag expected, '<' not found") -[] - ParserError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) +[] - ParserError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) [X] - ValueError("Values for parser can only be lxml or etree.") """ @@ -228,7 +228,7 @@ def test_not_io_object(parser): def test_wrong_file_lxml(datapath): with pytest.raises( (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory"), + match=("failed to load external entity|No such file or directory|没有那个文件或目录"), ): filename = os.path.join("data", "html", "books.xml") read_xml(filename, parser="lxml") @@ -840,7 +840,7 @@ def test_wrong_stylesheet(): with pytest.raises( (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory"), + match=("failed to load external entity|No such file or directory|没有那个文件或目录"), ): read_xml(kml, stylesheet=xsl) From a92c21e0765f8a247ba5665039cf00639bfaa7b6 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 3 Feb 2021 12:30:20 -0600 Subject: [PATCH 06/35] Add appropriate etree skips in tests --- pandas/tests/io/formats/test_to_xml.py | 28 +++++++++++++++----------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 4aadbdb4b32e4..3480c8891d594 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -33,7 +33,7 @@ [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") [X] - ValueError("stylesheet is not a url, file, or xml string.") -[] - LookupError +[] - LookupError (NEED WRONG ENCODING FOR FILE OUTPUT) [] - URLError (USUALLY DUE TO NETWORKING) [] - HTTPError (NEED AN ONLINE STYLESHEET) [X] - OSError("failed to load external entity") @@ -380,7 +380,7 @@ def 
test_na_empty_elem_option(datapath, parser): @pytest.mark.skipif( - sys.version_info <= (3, 7), + sys.version_info < (3, 8), reason=("etree alpha ordered attributes <= py3.7"), ) def test_attrs_cols_nan_output(datapath, parser): @@ -404,7 +404,7 @@ def test_attrs_cols_nan_output(datapath, parser): @pytest.mark.skipif( - sys.version_info <= (3, 7), + sys.version_info < (3, 8), reason=("etree alpha ordered attributes <= py3.7"), ) def test_attrs_cols_prefix(datapath, parser): @@ -582,7 +582,7 @@ def test_hierarchical_columns(datapath, parser): @pytest.mark.skipif( - sys.version_info <= (3, 7), + sys.version_info < (3, 8), reason=("etree alpha ordered attributes <= py3.7"), ) def test_hierarchical_attrs_columns(datapath, parser): @@ -665,7 +665,7 @@ def test_multi_index(datapath, parser): @pytest.mark.skipif( - sys.version_info <= (3, 7), + sys.version_info < (3, 8), reason=("etree alpha ordered attributes <= py3.7"), ) def test_multi_index_attrs_cols(datapath, parser): @@ -906,7 +906,8 @@ def test_misspelled_encoding(parser): # PRETTY PRINT -def test_xml_declaration_pretty_print(parser): +@td.skip_if_no("lxml") +def test_xml_declaration_pretty_print(): expected = """\ @@ -929,12 +930,13 @@ def test_xml_declaration_pretty_print(parser): """ - output = geom_df.to_xml(xml_declaration=False, parser=parser) + output = geom_df.to_xml(xml_declaration=False) assert output == expected -def test_no_pretty_print_with_decl(parser): +@td.skip_if_no("lxml") +def test_no_pretty_print_with_decl(): expected = ( "\n" "0square" @@ -945,7 +947,7 @@ def test_no_pretty_print_with_decl(parser): "" ) - output = geom_df.to_xml(pretty_print=False, parser=parser) + output = geom_df.to_xml(pretty_print=False) output = output.replace( '0square" "3604.0" @@ -966,7 +969,7 @@ def test_no_pretty_print_no_decl(parser): "" ) - output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser) + output = geom_df.to_xml(xml_declaration=False, pretty_print=False) assert output == expected 
@@ -1031,7 +1034,7 @@ def test_stylesheet_buffered_reader(datapath, mode): assert output == xsl_expected -def test_stylesheet_with_etree(datapath): +def test_stylesheet_with_etree_parser(datapath): xsl = datapath("io", "data", "xml", "row_field_output.xsl") with pytest.warns( @@ -1127,6 +1130,7 @@ def test_incorrect_xsl_eval(): geom_df.to_xml(stylesheet=xsl) +@td.skip_if_no("lxml") def test_incorrect_xsl_apply(parser): from lxml.etree import XSLTApplyError From 51f10f207747dd83f796f4e3f33ecb9cc3113c8a Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 3 Feb 2021 16:58:54 -0600 Subject: [PATCH 07/35] Remove check for warnings in tests --- pandas/tests/io/formats/test_to_xml.py | 30 -------------------------- pandas/tests/io/test_xml.py | 12 ----------- 2 files changed, 42 deletions(-) diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 3480c8891d594..791f5cdd48970 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -21,8 +21,6 @@ [X] - LookupError("unknown encoding") [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") -[X] - UserWarning("To use stylesheet, you need lxml installed.") -[X] - ImportWarning("You do not have lxml installed.") lxml [X] - TypeError("...is not a valid type for attr_cols") @@ -135,15 +133,6 @@ def parser(request): return request.param -# FAIL SAFE WARNING - - -@td.skip_if_installed("lxml") -def test_failsafe_parser(datapath): - with pytest.warns(ImportWarning, match=("You do not have lxml installed.")): - geom_df.to_xml() - - # FILE OUTPUT @@ -1034,25 +1023,6 @@ def test_stylesheet_buffered_reader(datapath, mode): assert output == xsl_expected -def test_stylesheet_with_etree_parser(datapath): - xsl = datapath("io", "data", "xml", "row_field_output.xsl") - - with pytest.warns( - UserWarning, match=("To use stylesheet, you need lxml installed.") - ): - geom_df.to_xml(parser="etree", stylesheet=xsl) - - 
-@td.skip_if_installed("lxml") -def test_stylesheet_without_lxml(datapath): - xsl = datapath("io", "data", "xml", "row_field_output.xsl") - - with pytest.warns( - UserWarning, match=("To use stylesheet, you need lxml installed.") - ): - geom_df.to_xml(stylesheet=xsl) - - @td.skip_if_no("lxml") def test_stylesheet_wrong_path(datapath): xsl = os.path.join("data", "xml", "row_field_output.xslt") diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 36eb4e2464209..51c14361a7cad 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -25,7 +25,6 @@ [X] - HTTPError("HTTP Error 404: Not Found") [X] - OSError("No such file") [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) -[X] - ImportWarning("You do not have lxml installed.") lxml [X] - ValueError("Either element or attributes can be parsed not both.") @@ -99,17 +98,6 @@ def parser(request): return request.param -# FAIL SAFE WARNING - - -@td.skip_if_installed("lxml") -def test_failsafe_parser(datapath): - filename = datapath("io", "data", "xml", "books.xml") - - with pytest.warns(ImportWarning, match=("You do not have lxml installed.")): - read_xml(filename) - - # FILE / URL From 3520d58f1cbaccc8eeb483c4e46117a3a9b26556 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 4 Feb 2021 10:06:07 -0600 Subject: [PATCH 08/35] Adjust code to conform to mypy and docstring validation --- pandas/core/frame.py | 23 ++-- pandas/io/formats/format.py | 10 +- pandas/io/formats/xml.py | 203 +++++++++++++++++------------------- pandas/io/xml.py | 71 +++++++------ 4 files changed, 158 insertions(+), 149 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5c256776540a9..cb738eff0bc1a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2607,15 +2607,15 @@ def to_html( def to_xml( self, path_or_buffer: Optional[FilePathOrBuffer[str]] = None, - index: Optional[bool] = True, + index: bool = True, root_name: Optional[str] = "data", row_name: 
Optional[str] = "row", na_rep: Optional[str] = None, attr_cols: Optional[Union[str, List[str]]] = None, elem_cols: Optional[Union[str, List[str]]] = None, - namespaces: Optional[Union[dict, List[dict]]] = None, + namespaces: Optional[Dict[Optional[str], str]] = None, prefix: Optional[str] = None, - encoding: Optional[str] = "utf-8", + encoding: str = "utf-8", xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, parser: Optional[str] = "lxml", @@ -2635,7 +2635,7 @@ def to_xml( Whether to include index in XML document. root_name : str, default 'data' The name of root element in XML document. - root_name : str, default 'row' + row_name : str, default 'row' The name of row element in XML document. na_rep : str, optional Missing data representation. @@ -2654,13 +2654,13 @@ def to_xml( Default namespaces should be given empty string key. For example, :: - namespaces = {'': 'https://example.com'} + namespaces = {"": "https://example.com"} prefix : str, optional Namespace prefix to be used for every element and/or attribute in document. This should be one of the keys in ``namespaces`` dict. - encoding : str, optional, default 'utf-8' + encoding : str, default 'utf-8' Encoding of the resulting document. xml_declaration : str, optional Whether to include the XML declaration at start of document. @@ -2697,7 +2697,7 @@ def to_xml( ... 'degrees': [360, 360, 180], ... 'sides': [4, np.nan, 3]}) - >>> df.to_xml() + >>> df.to_xml() # doctest: +SKIP @@ -2720,7 +2720,9 @@ def to_xml( - >>> df.to_xml(attr_cols=['index', 'shape', 'degrees', 'sides']) + >>> df.to_xml(attr_cols=[ + ... 'index', 'shape', 'degrees', 'sides' + ... ]) # doctest: +SKIP @@ -2728,8 +2730,8 @@ def to_xml( - >>> df.to_xml(namespaces = {"doc": "https://example.com"}, - ... prefix = "doc") + >>> df.to_xml(namespaces={"doc": "https://example.com"}, + ... 
prefix="doc") # doctest: +SKIP @@ -2756,7 +2758,6 @@ def to_xml( formatter = fmt.DataFrameFormatter( self, index=index, - na_rep=na_rep, ) return fmt.DataFrameRenderer(formatter).to_xml( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 870e2a5976319..7788faf52b01e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1015,9 +1015,9 @@ def to_xml( na_rep: Optional[str] = None, attr_cols: Optional[Union[str, List[str]]] = None, elem_cols: Optional[Union[str, List[str]]] = None, - namespaces: Optional[Union[dict, List[dict]]] = None, + namespaces: Optional[Dict[Optional[str], str]] = None, prefix: Optional[str] = None, - encoding: Optional[str] = "utf-8", + encoding: str = "utf-8", xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, parser: Optional[str] = "lxml", @@ -1037,7 +1037,7 @@ def to_xml( Whether to include index in XML document. root_name : str, default 'data' The name of root element in XML document. - root_name : str, default 'row' + row_name : str, default 'row' The name of row element in XML document. na_rep : str, optional Missing data representation. @@ -1062,7 +1062,7 @@ def to_xml( Namespace prefix to be used for every element and/or attribute in document. This should be one of the keys in ``namespaces`` dict. - encoding : str, optional, default 'utf-8' + encoding : str, default 'utf-8' Encoding of the resulting document. xml_declaration : str, optional Whether to include the XML declaration at start of document. 
@@ -1087,6 +1087,8 @@ def to_xml( lxml = import_optional_dependency("lxml.etree", errors="ignore") + TreeBuilder: Union[Type[EtreeXMLFormatter], Type[LxmlXMLFormatter]] + if parser == "lxml": if lxml is not None: TreeBuilder = LxmlXMLFormatter diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 90ee289ad3414..45cda69efbb05 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -4,7 +4,7 @@ import codecs import io -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union from urllib.error import HTTPError, URLError from warnings import warn @@ -79,11 +79,11 @@ def __init__( root_name: Optional[str] = "data", row_name: Optional[str] = "row", na_rep: Optional[str] = None, - attr_cols: Optional[Union[str, List[str]]] = None, - elem_cols: Optional[Union[str, List[str]]] = None, - namespaces: Optional[Dict[str, str]] = None, + attr_cols: Optional[List[str]] = None, + elem_cols: Optional[List[str]] = None, + namespaces: Optional[Dict[Optional[str], str]] = None, prefix: Optional[str] = None, - encoding: Optional[str] = "utf-8", + encoding: str = "utf-8", xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, stylesheet: Optional[FilePathOrBuffer[str]] = None, @@ -104,6 +104,9 @@ def __init__( self.stylesheet = stylesheet self.frame = self.fmt.frame + self.orig_cols = self.fmt.frame.columns.tolist() + self.frame_dicts = self.process_dataframe() + def build_tree(self) -> bytes: """ Build tree from data. @@ -151,7 +154,7 @@ def validate_encoding(self) -> None: except LookupError as e: raise e - def process_dataframe(self) -> None: + def process_dataframe(self) -> Dict[Union[int, str], Dict[str, Any]]: """ Adjust Data Frame to fit xml output. @@ -176,13 +179,15 @@ def handle_indexes(self) -> None: This method will add indexes into attr_cols or elem_cols. 
""" - indexes = [x for x in self.frame_dicts[0].keys() if x not in self.orig_cols] + indexes: List[str] = [ + x for x in self.frame_dicts[0].keys() if x not in self.orig_cols + ] if self.attr_cols and self.index: - self.attr_cols = list(indexes) + self.attr_cols + self.attr_cols = indexes + self.attr_cols if self.elem_cols and self.index: - self.elem_cols = list(indexes) + self.elem_cols + self.elem_cols = indexes + self.elem_cols def get_prefix_uri(self) -> str: """ @@ -207,7 +212,7 @@ def other_namespaces(self) -> dict: prefix. """ - nmsp_dict = {} + nmsp_dict: Dict[str, str] = {} if self.namespaces and self.prefix is None: nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p != ""} @@ -238,18 +243,19 @@ def build_elems(self) -> None: def write_output(self) -> Optional[str]: xml_doc = self.build_tree() + out_str: Optional[str] = xml_doc.decode(self.encoding).rstrip() try: - if self.path_or_buffer: + if self.path_or_buffer and isinstance(self.path_or_buffer, str): with open(self.path_or_buffer, "wb") as f: f.write(xml_doc) - xml_doc = None - else: - xml_doc = xml_doc.decode(self.encoding).rstrip() + + out_str = None + except (UnicodeDecodeError, OSError) as e: raise e - return xml_doc + return out_str class EtreeXMLFormatter(BaseXMLFormatter): @@ -268,8 +274,6 @@ def __init__(self, *args, **kwargs): self.validate_columns() self.validate_encoding() - self.orig_cols = self.fmt.frame.columns.tolist() - self.frame_dicts = self.process_dataframe() self.handle_indexes() self.prefix_uri = self.get_prefix_uri() @@ -284,13 +288,12 @@ def build_tree(self) -> bytes: self.d = d self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") - if self.attr_cols: - self.build_attribs() - if self.elem_cols: - self.build_elems() if not self.attr_cols and not self.elem_cols: self.elem_cols = list(self.frame_dicts[0].keys()) self.build_elems() + else: + self.build_attribs() + self.build_elems() self.out_xml = tostring(self.root, method="xml", 
encoding=self.encoding) @@ -315,7 +318,8 @@ def get_prefix_uri(self) -> str: uri = "" if self.namespaces: for p, n in self.namespaces.items(): - register_namespace(p, n) + if isinstance(p, str) and isinstance(n, str): + register_namespace(p, n) if self.prefix: try: uri = f"{{{self.namespaces[self.prefix]}}}" @@ -327,40 +331,42 @@ def get_prefix_uri(self) -> str: return uri def build_attribs(self) -> None: - for col in self.attr_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - attr_name = f"{self.prefix_uri}{flat_col}" - try: - if self.d[col] is not None: - self.elem_row.attrib[attr_name] = str(self.d[col]) - except KeyError: - raise KeyError(f"no valid column, {col}") + if self.attr_cols: + for col in self.attr_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + attr_name = f"{self.prefix_uri}{flat_col}" + try: + if self.d[col] is not None: + self.elem_row.attrib[attr_name] = str(self.d[col]) + except KeyError: + raise KeyError(f"no valid column, {col}") def build_elems(self) -> None: from xml.etree.ElementTree import SubElement - for col in self.elem_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - elem_name = f"{self.prefix_uri}{flat_col}" - try: - val = None if self.d[col] in [None, ""] else str(self.d[col]) - SubElement(self.elem_row, elem_name).text = val - except KeyError: - raise KeyError(f"no valid column, {col}") + if self.elem_cols: + for col in self.elem_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + elem_name = f"{self.prefix_uri}{flat_col}" + try: + val = None if 
self.d[col] in [None, ""] else str(self.d[col]) + SubElement(self.elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") def prettify_tree(self) -> bytes: """ @@ -375,7 +381,7 @@ def prettify_tree(self) -> bytes: return dom.toprettyxml(indent=" ", encoding=self.encoding) - def remove_declaration(self) -> None: + def remove_declaration(self) -> bytes: """ Remove xml declaration. @@ -402,8 +408,6 @@ def __init__(self, *args, **kwargs): self.validate_columns() self.validate_encoding() - self.orig_cols = self.fmt.frame.columns.tolist() - self.frame_dicts = self.process_dataframe() self.prefix_uri = self.get_prefix_uri() self.convert_empty_str_key() @@ -424,15 +428,12 @@ def build_tree(self) -> bytes: self.d = d self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") - if self.attr_cols: - self.build_attribs() - - if self.elem_cols: - self.build_elems() - if not self.attr_cols and not self.elem_cols: self.elem_cols = list(self.frame_dicts[0].keys()) self.build_elems() + else: + self.build_attribs() + self.build_elems() self.out_xml = tostring( self.root, @@ -472,54 +473,44 @@ def get_prefix_uri(self) -> str: return uri def build_attribs(self) -> None: - """ - Create attributes of row. - - This method adds attributes using attr_cols to row element and - works with tuples for multindex or hierarchical columns. 
- """ - for col in self.attr_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - attr_name = f"{self.prefix_uri}{flat_col}" - try: - if self.d[col] is not None: - self.elem_row.attrib[attr_name] = self.d[col] - except KeyError: - raise KeyError(f"no valid column, {col}") + if self.attr_cols: + for col in self.attr_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + attr_name = f"{self.prefix_uri}{flat_col}" + try: + if self.d[col] is not None: + self.elem_row.attrib[attr_name] = self.d[col] + except KeyError: + raise KeyError(f"no valid column, {col}") def build_elems(self) -> None: - """ - Create child elements of row. - - This method adds child elements using elem_cols to row element and - works with tuples for multindex or hierarchical columns. 
- """ from lxml.etree import SubElement - for col in self.elem_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - elem_name = f"{self.prefix_uri}{flat_col}" - try: - val = None if self.d[col] in [None, ""] else str(self.d[col]) - SubElement(self.elem_row, elem_name).text = val - except KeyError: - raise KeyError(f"no valid column, {col}") + if self.elem_cols: + for col in self.elem_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + elem_name = f"{self.prefix_uri}{flat_col}" + try: + val = None if self.d[col] in [None, ""] else str(self.d[col]) + SubElement(self.elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") - def convert_io(self) -> Union[None, str]: + def convert_io(self) -> Union[bytes, str, None]: """ Convert stylesheet object to string. @@ -527,6 +518,8 @@ def convert_io(self) -> Union[None, str]: as string, depending on object type. """ + obj: Union[bytes, str, None] = None + if isinstance(self.stylesheet, str): obj = self.stylesheet @@ -577,7 +570,7 @@ def parse_doc(self): from lxml.etree import XML, XMLParser, XMLSyntaxError, parse current_doc = self.convert_io() - if current_doc: + if current_doc and isinstance(current_doc, str): is_xml = current_doc.startswith((" List[Dict[str, List[str]]]: + def parse_data(self) -> List[Dict[str, Optional[str]]]: """ Parse xml data. @@ -104,7 +104,7 @@ def parse_data(self) -> List[Dict[str, List[str]]]: raise AbstractMethodError(self) - def _parse_nodes(self) -> List[Dict[str, List[str]]]: + def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: """ Parse xml nodes. 
@@ -157,7 +157,7 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) - def _convert_io(self, xml_data) -> Union[None, str]: + def _convert_io(self, xml_data) -> Union[str, bytes, None]: """ Convert io object to string. @@ -165,6 +165,8 @@ def _convert_io(self, xml_data) -> Union[None, str]: as string, depending on object type. """ + obj: Union[bytes, str, None] = None + if isinstance(xml_data, str): obj = xml_data @@ -231,7 +233,7 @@ class _EtreeFrameParser(_XMLFrameParser): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def parse_data(self) -> List[Dict[str, List[str]]]: + def parse_data(self) -> List[Dict[str, Optional[str]]]: if self.stylesheet: warn( @@ -247,9 +249,9 @@ def parse_data(self) -> List[Dict[str, List[str]]]: return self._parse_nodes() - def _parse_nodes(self) -> List[Dict[str, List[str]]]: - + def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + dicts: List[Dict[str, Optional[str]]] if self.elems_only and self.attrs_only: raise ValueError("Either element or attributes can be parsed not both.") @@ -279,7 +281,10 @@ def _parse_nodes(self) -> List[Dict[str, List[str]]]: ] elif self.attrs_only: - dicts = [el.attrib for el in elems] + dicts = [ + {k: v.strip() if v else None for k, v in el.attrib.items()} + for el in elems + ] else: if self.names: @@ -363,9 +368,9 @@ def _validate_path(self) -> None: def _validate_names(self) -> None: if self.names: - children = self.xml_doc.find( - self.xpath, namespaces=self.namespaces - ).findall("*") + parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + if parent: + children = parent.findall("*") if is_list_like(self.names): if len(self.names) < len(children): @@ -378,20 +383,24 @@ def _validate_names(self) -> None: ) def _parse_doc(self) -> Union[Element, ElementTree]: - from xml.etree.ElementTree import ParseError, fromstring, parse + from xml.etree.ElementTree import ( + 
Element, + ElementTree, + ParseError, + fromstring, + parse, + ) current_doc = self._convert_io(self.io) if current_doc: - is_xml = current_doc.startswith((" List[Dict[str, List[str]]]: + def parse_data(self) -> List[Dict[str, Optional[str]]]: """ Parse xml data. @@ -444,8 +453,9 @@ def parse_data(self) -> List[Dict[str, List[str]]]: return self._parse_nodes() - def _parse_nodes(self) -> List[Dict[str, List[str]]]: + def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + dicts: List[Dict[str, Optional[str]]] if self.elems_only and self.attrs_only: raise ValueError("Either element or attributes can be parsed not both.") @@ -601,7 +611,10 @@ def _parse_doc(self): current_doc = self._convert_io(self.raw_doc) if current_doc: - is_xml = current_doc.startswith(("''' >>> df = pd.read_xml(xml) - >>> df shape degrees sides 0 square 360 4.0 @@ -871,10 +886,9 @@ def read_xml( ... ... ... - ... "''' + ... ''' >>> df = pd.read_xml(xml, xpath=".//row") - >>> df shape degrees sides 0 square 360 4.0 @@ -900,10 +914,9 @@ def read_xml( ... ... ''' - >>> df = pd.read(xml, - xpath="//doc:row", - namespaces = {'doc': 'https://example.com'}) - + >>> df = pd.read_xml(xml, + ... xpath="//doc:row", + ... 
namespaces={"doc": "https://example.com"}) >>> df shape degrees sides 0 square 360 4.0 From 483256250fdc057c4360afcb8f6a6fcff2edd069 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 4 Feb 2021 11:57:23 -0600 Subject: [PATCH 09/35] Add read_xml to TestPDApi test and fix for etree test --- pandas/tests/api/test_api.py | 1 + pandas/tests/io/test_xml.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 541c2988a0636..fd1c19219c4bf 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -159,6 +159,7 @@ class TestPDApi(Base): "read_gbq", "read_hdf", "read_html", + "read_xml", "read_json", "read_pickle", "read_sas", diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 51c14361a7cad..18e953db92c8d 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -216,7 +216,7 @@ def test_not_io_object(parser): def test_wrong_file_lxml(datapath): with pytest.raises( (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory|没有那个文件或目录"), + match=(r"failed to load external entity|No such file or directory|没有那个文件或目录"), ): filename = os.path.join("data", "html", "books.xml") read_xml(filename, parser="lxml") From 2914c32b4d6713fbed964dbae1bbd780008e361e Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 4 Feb 2021 11:57:54 -0600 Subject: [PATCH 10/35] Add read_xml to TestPDApi test and fix for etree test --- pandas/tests/io/test_xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 18e953db92c8d..95ac1c5ff8db3 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -216,14 +216,14 @@ def test_not_io_object(parser): def test_wrong_file_lxml(datapath): with pytest.raises( (OSError, FileNotFoundError), - match=(r"failed to load external entity|No such file or 
directory|没有那个文件或目录"), + match=("failed to load external entity|No such file or directory|没有那个文件或目录"), ): filename = os.path.join("data", "html", "books.xml") read_xml(filename, parser="lxml") def test_wrong_file_etree(datapath): - with pytest.raises(OSError, match=("No such file")): + with pytest.raises(OSError, match=("No such file|没有那个文件或目录")): filename = os.path.join("data", "html", "books.xml") read_xml(filename, parser="etree") From 72d0e93aec1a024cf473a8cf01bcb9a936ad6251 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 4 Feb 2021 15:15:05 -0600 Subject: [PATCH 11/35] Replace lxml ImportWarning for ImportError with added tests --- pandas/core/frame.py | 12 ++++----- pandas/io/formats/format.py | 18 +++++--------- pandas/io/formats/xml.py | 10 -------- pandas/io/xml.py | 34 ++++++-------------------- pandas/tests/io/formats/test_to_xml.py | 21 ++++++++++++++++ pandas/tests/io/test_xml.py | 13 ++++++++++ 6 files changed, 53 insertions(+), 55 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cb738eff0bc1a..78ad2c089dc3e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2631,7 +2631,7 @@ def to_xml( path_or_buffer : str, path object or file-like object, optional File to write output to. If None, the output is returned as a string. - index : bool, optional + index : bool, default True Whether to include index in XML document. root_name : str, default 'data' The name of root element in XML document. @@ -2662,17 +2662,15 @@ def to_xml( dict. encoding : str, default 'utf-8' Encoding of the resulting document. - xml_declaration : str, optional + xml_declaration : bool, default True Whether to include the XML declaration at start of document. - pretty_print : bool, optional + pretty_print : bool, default True Whether output should be pretty printed with indentation and line breaks. - parser : {'lxml','etree'}, default "lxml" + parser : {'lxml','etree'}, default 'lxml' Parser module to use for building of tree. 
Only 'lxml' and 'etree' are supported. With 'lxml', the ability to use XSLT - stylesheet is supported. Default parser uses 'lxml'. If - module is not installed a warning will raise and process - will continue with 'etree'. + stylesheet is supported. stylesheet : str, path object or file-like object, optional A URL, file-like object, or a raw string containing an XSLT script used to transform the raw XML output. Script should use diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7788faf52b01e..0f7ed424a5f1e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1,5 +1,5 @@ """ -Internal module for formatting output data in csv, html, +Internal module for formatting output data in csv, html, xml, and latex files. This module also applies to display formatting. """ from __future__ import annotations @@ -30,7 +30,6 @@ cast, ) from unicodedata import east_asian_width -from warnings import warn import numpy as np @@ -1033,7 +1032,7 @@ def to_xml( path_or_buffer : str, path object or file-like object, optional File to write output to. If None, the output is returned as a string. - index : bool, optional + index : bool, default True Whether to include index in XML document. root_name : str, default 'data' The name of root element in XML document. @@ -1066,15 +1065,13 @@ def to_xml( Encoding of the resulting document. xml_declaration : str, optional Whether to include the XML declaration at start of document. - pretty_print : bool, optional + pretty_print : bool, default True Whether output should be pretty printed with indentation and line breaks. parser : {'lxml','etree'}, default "lxml" Parser module to use for building of tree. Only 'lxml' and 'etree' are supported. With 'lxml', the ability to use XSLT - stylesheet is supported. Default parser uses 'lxml'. If - module is not installed a warning will raise and process - will continue with 'etree'. + stylesheet is supported. 
stylesheet : str, path object or file-like object, optional A URL, file-like object, or a raw string containing an XSLT script used to transform the raw XML output. Script should use @@ -1093,12 +1090,9 @@ def to_xml( if lxml is not None: TreeBuilder = LxmlXMLFormatter else: - warn( - "You do not have lxml installed (default parser). " - "Instead, etree will be used.", - ImportWarning, + raise ImportError( + "lxml not found, please install or use the etree parser." ) - TreeBuilder = EtreeXMLFormatter elif parser == "etree": TreeBuilder = EtreeXMLFormatter diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 45cda69efbb05..39ff7f7e24222 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -262,11 +262,6 @@ class EtreeXMLFormatter(BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. - - Notes - ----- - This class serves as fall back option if user does not have - ``lxml`` installed or user specifically requests ``etree`` parser. """ def __init__(self, *args, **kwargs): @@ -396,11 +391,6 @@ class LxmlXMLFormatter(BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. - - Notes - ----- - This class serves as default option. If user does not have `lxml` - installed, `to_xml` will fall back with EtreeXMLFormatter. """ def __init__(self, *args, **kwargs): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index eab5e270ff835..b1c5978877cc6 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -220,12 +220,7 @@ def _parse_doc(self): class _EtreeFrameParser(_XMLFrameParser): """ Internal class to parse XML into DataFrames with the Python - standard library XML modules: `xml.etree.ElementTree`. - - Notes - ----- - This class serves as fall back option if user does not have - ``lxml`` installed or user specifically requests ``etree`` parser. 
+ standard library XML module: `xml.etree.ElementTree`. """ from xml.etree.ElementTree import Element, ElementTree @@ -420,13 +415,6 @@ class _LxmlFrameParser(_XMLFrameParser): Internal class to parse XML into DataFrames with third-party full-featured XML library, `lxml`, that supports XPath 1.0 and XSLT 1.0. - - Notes - ----- - This is the default class called with `_EtreeFrameParser` serving - as fall back option if user does not have ``lxml`` installed. - With `lxml`, the user enjoys the full scope of funcationality and - efficiency. """ def __init__(self, *args, **kwargs): @@ -705,11 +693,7 @@ def _parse( stylesheet, ) else: - warn( - "You do not have lxml installed (default parser). " - "Instead, etree will be used.", - ImportWarning, - ) + raise ImportError("lxml not found, please install or use the etree parser.") p = _EtreeFrameParser( io, @@ -762,7 +746,7 @@ def read_xml( io : str, path object or file-like object A URL, file-like object, or raw string containing XML. - xpath : str, optional + xpath : str, optional, default './*' The XPath to parse required set of nodes for migration to DataFrame. XPath should return a collection of elements and not a single element. Note: The ``etree`` parser supports limited XPath @@ -780,11 +764,11 @@ def read_xml( namespaces = {"doc": "https://example.com"} - elems_only : bool, optional, default = False + elems_only : bool, optional, default False Parse only the child elements at the specified ``xpath``. By default, all child elements and non-empty text nodes are returned. - attrs_only : bool, optional, default = False + attrs_only : bool, optional, default False Parse only the attributes at the specified ``xpath``. By default, all attributes are returned. @@ -792,15 +776,13 @@ def read_xml( Column names for DataFrame of parsed XML data. Use this parameter to rename original element names and distinguish same named elements. 
- encoding : str, optional, default = 'utf-8' + encoding : str, optional, default 'utf-8' Encoding of XML document. - parser : {'lxml','etree'}, default='lxml' + parser : {'lxml','etree'}, default 'lxml' Parser module to use for retrieval of data. Only 'lxml' and 'etree' are supported. With 'lxml' more complex XPath searches - and ability to use XSLT stylesheet are supported. Default parser - uses 'lxml'. If module is not installed a warning will raise and - process will continue with 'etree'. + and ability to use XSLT stylesheet are supported. stylesheet : str, path object or file-like object A URL, file-like object, or a raw string containing an XSLT script. diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 791f5cdd48970..dd90f8292142f 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -15,7 +15,10 @@ """ CHECKLIST +[x] - ValueError("Values for parser can only be lxml or etree.") + etree +[x] - ImportError("lxml not found, please install or use the etree parser.") [X] - TypeError("...is not a valid type for attr_cols") [X] - TypeError("...is not a valid type for elem_cols") [X] - LookupError("unknown encoding") @@ -963,6 +966,24 @@ def test_no_pretty_print_no_decl(): assert output == expected +# PARSER + + +@td.skip_if_installed("lxml") +def test_default_parser_no_lxml(): + with pytest.raises( + ImportError, match=("lxml not found, please install or use the etree parser.") + ): + geom_df.to_xml() + + +def test_unknown_parser(): + with pytest.raises( + ValueError, match=("Values for parser can only be lxml or etree.") + ): + geom_df.to_xml(parser="bs4") + + # STYLESHEET xsl_expected = """\ diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 95ac1c5ff8db3..8bc195377a340 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -14,7 +14,10 @@ """ CHECK LIST +[x] - ValueError("Values for parser can only be lxml or etree.") + 
etree +[x] - ImportError("lxml not found, please install or use the etree parser.") [X] - ValueError("Either element or attributes can be parsed not both.") [X] - ValueError("xpath does not return any nodes...") [X] - SyntaxError("You have used an incorrect or unsupported XPath") @@ -560,6 +563,16 @@ def test_parser_consistency_with_encoding(datapath): # PARSER +@td.skip_if_installed("lxml") +def test_default_parser_no_lxml(datapath): + filename = datapath("io", "data", "xml", "books.xml") + + with pytest.raises( + ImportError, match=("lxml not found, please install or use the etree parser.") + ): + read_xml(filename) + + def test_wrong_parser(datapath): filename = datapath("io", "data", "xml", "books.xml") From b80b8ce32f40a34e370bbc5c8d3476d4cc08ad44 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Fri, 5 Feb 2021 11:18:14 -0600 Subject: [PATCH 12/35] Adjust fixture for lxml skip and add error validation in tests --- pandas/io/formats/xml.py | 2 +- pandas/io/xml.py | 11 +++++++++-- pandas/tests/io/formats/test_to_xml.py | 14 ++++++++++++-- pandas/tests/io/test_xml.py | 23 ++++++++--------------- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 39ff7f7e24222..b361748dca819 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -252,7 +252,7 @@ def write_output(self) -> Optional[str]: out_str = None - except (UnicodeDecodeError, OSError) as e: + except (UnicodeDecodeError, OSError, FileNotFoundError) as e: raise e return out_str diff --git a/pandas/io/xml.py b/pandas/io/xml.py index b1c5978877cc6..a4e010ee35f23 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -404,7 +404,7 @@ def _parse_doc(self) -> Union[Element, ElementTree]: r = fromstring(current_doc) else: r = parse(current_doc) - except (URLError, HTTPError, OSError, ParseError) as e: + except (URLError, HTTPError, OSError, FileNotFoundError, ParseError) as e: raise e return r @@ -618,7 +618,14 @@ def 
_parse_doc(self): r = XML(current_doc) else: r = parse(current_doc, parser=curr_parser) - except (LookupError, URLError, HTTPError, OSError, XMLSyntaxError) as e: + except ( + LookupError, + URLError, + HTTPError, + OSError, + FileNotFoundError, + XMLSyntaxError, + ) as e: raise e return r diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index dd90f8292142f..bd3e0728fc42e 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -24,13 +24,16 @@ [X] - LookupError("unknown encoding") [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") +[] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) +[X] - FileNotFoundError("No such file or directory") lxml [X] - TypeError("...is not a valid type for attr_cols") [X] - TypeError("...is not a valid type for elem_cols") [X] - LookupError("unknown encoding") [] - UnicodeDecodeError (NEED NON-UTF-8 STYLESHEET) -[] - OSError (NEED UNREACHABLE LOCAL FILE PATH) +[] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) +[X] - FileNotFoundError("No such file or directory") [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") [X] - ValueError("stylesheet is not a url, file, or xml string.") @@ -131,7 +134,7 @@ def mode(request): return request.param -@pytest.fixture(params=["lxml", "etree"]) +@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) def parser(request): return request.param @@ -190,6 +193,13 @@ def test_str_output(datapath, parser): assert output == from_file_expected +def test_wrong_file_path(parser): + with pytest.raises( + FileNotFoundError, match=("No such file or directory|没有那个文件或目录") + ): + geom_df.to_xml("/my/fake/path/output.xml") + + # INDEX diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 8bc195377a340..e792d75f1b070 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py 
@@ -24,9 +24,10 @@ [X] - ValueError("names does not match length of child elements in xpath.") [X] - TypeError("...is not a valid type for names") [X] - ValueError("io is not a url, file, or xml string") -[] - URLError (USUALLY DUE TO NETWORKING) +[] - URLError (GENERAL ERROR WITH HTTPError AS SUBCLASS) [X] - HTTPError("HTTP Error 404: Not Found") -[X] - OSError("No such file") +[] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) +[X] - FileNotFoundError("No such file or directory") [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) lxml @@ -96,7 +97,7 @@ def mode(request): return request.param -@pytest.fixture(params=["lxml", "etree"]) +@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) def parser(request): return request.param @@ -209,26 +210,18 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) -@td.skip_if_no("lxml") -def test_not_io_object(parser): +def test_wrong_io_object(parser): with pytest.raises(ValueError, match=("io is not a url, file, or xml string")): - read_xml(DataFrame, parser="lxml") + read_xml(DataFrame, parser=parser) -@td.skip_if_no("lxml") -def test_wrong_file_lxml(datapath): +def test_wrong_file_path(datapath, parser): with pytest.raises( (OSError, FileNotFoundError), match=("failed to load external entity|No such file or directory|没有那个文件或目录"), ): filename = os.path.join("data", "html", "books.xml") - read_xml(filename, parser="lxml") - - -def test_wrong_file_etree(datapath): - with pytest.raises(OSError, match=("No such file|没有那个文件或目录")): - filename = os.path.join("data", "html", "books.xml") - read_xml(filename, parser="etree") + read_xml(filename, parser=parser) @tm.network From a6cfc90eccff7754e87d4f7093d29ff7c8f72812 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Fri, 5 Feb 2021 12:21:04 -0600 Subject: [PATCH 13/35] Add conditional skips for envs without lxml --- pandas/tests/io/formats/test_to_xml.py | 15 
++++++++------- pandas/tests/io/test_xml.py | 7 ++++++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index bd3e0728fc42e..079cc6bf0fa39 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -182,7 +182,7 @@ def test_str_output(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) - output = df_file.to_xml() + output = df_file.to_xml(parser=parser) # etree and lxml differs on quotes and case in xml declaration output = output.replace( @@ -197,7 +197,7 @@ def test_wrong_file_path(parser): with pytest.raises( FileNotFoundError, match=("No such file or directory|没有那个文件或目录") ): - geom_df.to_xml("/my/fake/path/output.xml") + geom_df.to_xml("/my/fake/path/output.xml", parser=parser) # INDEX @@ -871,7 +871,7 @@ def test_encoding_option_str(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") df_file = read_xml(filename, parser=parser, encoding="ISO-8859-1").head(5) - output = df_file.to_xml(encoding="ISO-8859-1") + output = df_file.to_xml(encoding="ISO-8859-1", parser=parser) # etree and lxml differs on quotes and case in xml declaration output = output.replace( @@ -885,16 +885,17 @@ def test_encoding_option_str(datapath, parser): @td.skip_if_no("lxml") def test_correct_encoding_file(datapath): filename = datapath("io", "data", "xml", "baby_names.xml") - df_file = read_xml(filename, encoding="ISO-8859-1") + df_file = read_xml(filename, encoding="ISO-8859-1", parser="lxml") with tm.ensure_clean("test.xml") as path: - df_file.to_xml(path, index=False, encoding="ISO-8859-1") + df_file.to_xml(path, index=False, encoding="ISO-8859-1", parser="lxml") +@td.skip_if_no("lxml") @pytest.mark.parametrize("encoding", ["UTF-8", "UTF-16", "ISO-8859-1"]) def test_wrong_encoding_option_lxml(datapath, parser, encoding): filename = datapath("io", "data", "xml", 
"baby_names.xml") - df_file = read_xml(filename, encoding="ISO-8859-1") + df_file = read_xml(filename, encoding="ISO-8859-1", parser="lxml") with tm.ensure_clean("test.xml") as path: df_file.to_xml(path, index=False, encoding=encoding, parser=parser) @@ -902,7 +903,7 @@ def test_wrong_encoding_option_lxml(datapath, parser, encoding): def test_misspelled_encoding(parser): with pytest.raises(LookupError, match=("unknown encoding")): - geom_df.to_xml(parser=parser, encoding="uft-8") + geom_df.to_xml(encoding="uft-8", parser=parser) # PRETTY PRINT diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index e792d75f1b070..cd5738d259eb9 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -253,7 +253,8 @@ def test_wrong_url(parser): # XPATH -def test_empty_xpath_lxml(datapath, parser): +@td.skip_if_no("lxml") +def test_empty_xpath_lxml(datapath): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(ValueError, match=("xpath does not return any nodes")): read_xml(filename, xpath=".//python", parser="lxml") @@ -317,6 +318,7 @@ def test_prefix_namespace(parser): tm.assert_frame_equal(df_nmsp, df_expected) +@td.skip_if_no("lxml") def test_consistency_default_namespace(): df_lxml = read_xml( xml_default_nmsp, @@ -335,6 +337,7 @@ def test_consistency_default_namespace(): tm.assert_frame_equal(df_lxml, df_etree) +@td.skip_if_no("lxml") def test_consistency_prefix_namespace(): df_lxml = read_xml( xml_prefix_nmsp, @@ -442,6 +445,7 @@ def test_elem_and_attrs_only(datapath, parser): read_xml(filename, elems_only=True, attrs_only=True, parser=parser) +@td.skip_if_no("lxml") def test_attribute_centric_xml(): xml = """\ @@ -545,6 +549,7 @@ def test_ascii_encoding(datapath, parser): read_xml(filename, encoding="ascii", parser=parser) +@td.skip_if_no("lxml") def test_parser_consistency_with_encoding(datapath): filename = datapath("io", "data", "xml", "baby_names.xml") df_lxml = read_xml(filename, parser="lxml", 
encoding="ISO-8859-1") From 6c4e0b4c2d2d9524c93b525eeeddd24181eeffb2 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Fri, 5 Feb 2021 15:22:24 -0600 Subject: [PATCH 14/35] Clean up whatsnew rst of rebase issue --- doc/source/whatsnew/v1.3.0.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8b32bdeb0deea..3187c4dfdb7b9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -41,10 +41,6 @@ See ref:`window.overview` for performance and functional benefits. (:issue:`1509 .. _whatsnew_130.read_to_xml: -We added I/O support to read and render shallow versions of XML documents with -:func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using lxml as parser, -full XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) -======= We added I/O support to read and render shallow versions of `XML`_ documents with :func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using `lxml`_ as parser, full XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) From a57fd35e75d8150601e2feca351010f809ede674 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Fri, 5 Feb 2021 16:45:02 -0600 Subject: [PATCH 15/35] Fix unescaped emphasis and wording in read_xml docstring --- pandas/io/xml.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a4e010ee35f23..cb66bd79f4c48 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -674,14 +674,11 @@ def _parse( Raises ------ - ValueError - * If parser is not lxml or etree.e. + ImportError + * If lxml is not installed if selected as parser. - Notes - ----- - This method will raise a warning instead of module not found or - import error if user does not have 1xml and then reverts to - fallback option with etree parser. + ValueError + * If parser is not lxml or etree. 
""" lxml = import_optional_dependency("lxml.etree", errors="ignore") @@ -753,7 +750,7 @@ def read_xml( io : str, path object or file-like object A URL, file-like object, or raw string containing XML. - xpath : str, optional, default './*' + xpath : str, optional, default './\*' The XPath to parse required set of nodes for migration to DataFrame. XPath should return a collection of elements and not a single element. Note: The ``etree`` parser supports limited XPath @@ -766,8 +763,8 @@ def read_xml( namespaces in XML, only the ones used in ``xpath`` expression. Note: if XML document uses default namespace denoted as `xmlns=''` without a prefix, you must assign any temporary - namespace, like 'doc', to URI in order to parse any underlying - nodes. For example, :: + namespace prefix such as 'doc' to the URI in order to parse + underlying nodes and/or attributes. For example, :: namespaces = {"doc": "https://example.com"} @@ -793,12 +790,12 @@ def read_xml( stylesheet : str, path object or file-like object A URL, file-like object, or a raw string containing an XSLT script. - This stylesheet should flatten complex, deeply nested XML documents. - To use this feature you must have ``lxml`` module installed and use - 'lxml' as ``parser``. The ``xpath`` must reference nodes of - transformed XML document generated after XSLT transformation and not - the original XML document. Only XSLT 1.0 scripts and not later - versions is currently supported. + This stylesheet should flatten complex, deeply nested XML documents + for easier parsing. To use this feature you must have ``lxml`` module + installed and specify 'lxml' as ``parser``. The ``xpath`` must + reference nodes of transformed XML document generated after XSLT + transformation and not the original XML document. Only XSLT 1.0 + scripts and not later versions is currently supported. 
Returns ------- From 23439b49beafbbab843396ba4358263d4d8f5150 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 7 Feb 2021 19:44:14 -0600 Subject: [PATCH 16/35] Add XML section in io.rst and lxml dependency for read_xml in install.rst --- doc/source/getting_started/install.rst | 2 +- doc/source/user_guide/io.rst | 445 +++++++++++++++++++++++++ 2 files changed, 446 insertions(+), 1 deletion(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 49039f05b889a..d49c2698a1ace 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -271,7 +271,7 @@ fsspec 0.7.4 Handling files aside from local and fastparquet 0.4.0 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) -lxml 4.3.0 HTML parser for read_html (see :ref:`note `) +lxml 4.3.0 HTML parser for read_html (see :ref:`note `); XML parser for read_xml matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.6.0 Reading / writing for xlsx files diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d6934a3ca2a6c..185432d3c09d4 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -22,6 +22,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;Fixed-Width Text File;:ref:`read_fwf` text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` + text;`XML `__;:ref:`read_xml`;:ref:`to_xml` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` binary;`OpenDocument `__;:ref:`read_excel`; @@ -2831,6 +2832,450 @@ parse HTML tables in the top-level pandas io function ``read_html``. +XML +--- + +.. _io.read_xml: + +Reading XML +''''''''''' + +.. 
versionadded:: 1.3.0 + +The top-level :func:`~pandas.io.xml.read_xml` function can accept an XML +string/file/URL and will parse nodes and attributes into a pandas ``DataFrame``. + +.. note:: + + Since there is no standard XML structure where design types can vary in + many ways, ``read_xml`` works best with flatter, shallow versions. If + an XML document is deeply nested, use the ``stylesheet`` feature to + transform XML into a flatter version. + +Let's look at a few examples. + +Read an XML string: + +.. ipython:: python + + xml = """ + + + Everyday Italian + Giada De Laurentiis + 2005 + 30.00 + + + Harry Potter + J K. Rowling + 2005 + 29.99 + + + Learning XML + Erik T. Ray + 2003 + 39.95 + + """ + + df = pd.read_xml(xml) + df + +Read a URL with no options: + +.. ipython:: python + + df = pd.read_xml("https://www.w3schools.com/xml/books.xml") + df + +Read in the content of the "books.xml" file and pass it to ``read_xml`` +as a string: + +.. ipython:: python + :suppress: + + rel_path = os.path.join("..", "pandas", "tests", "io", "data", "xml", + "books.xml") + file_path = os.path.abspath(rel_path) + +.. ipython:: python + + with open(file_path, "r") as f: + df = pd.read_xml(f.read()) + df + +Read in the content of the "books.xml" as instance of ``StringIO`` or +``BytesIO`` and pass it to ``read_xml``: + +.. ipython:: python + + with open(file_path, "r") as f: + sio = StringIO(f.read()) + + df = pd.read_xml(sio) + df + +.. ipython:: python + + with open(file_path, "rb") as f: + bio = BytesIO(f.read()) + + df = pd.read_xml(bio) + df + +With `lxml`_ as default ``parser``, you access the full-featured XML library +that extends Python's ElementTree API. One powerful tool is ability to query +nodes selectively or conditionally with more expressive XPath: + +.. _lxml: https://lxml.de + +.. ipython:: python + + df = pd.read_xml(file_path, xpath="//book[year=2005]") + df + +Specify only elements or only attributes to parse: + +.. 
ipython:: python + + df = pd.read_xml(file_path, elems_only=True) + df + +.. ipython:: python + + df = pd.read_xml(file_path, attrs_only=True) + df + +XML documents can have namespaces with prefixes and default namespaces without +prefixes both of which are denoted with a special attribute ``xmlns``. In order +to parse by node under a namespace context, ``xpath`` must reference a prefix. + +For example, below XML contains a namespace with prefix, ``doc``, and URI at +``https://example.com``. In order to parse ``doc:row`` nodes, +``namespaces`` must be used. + +.. ipython:: python + + xml = """ + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + + """ + + df = pd.read_xml(xml, + xpath="//doc:row", + namespaces={"doc": "https://example.com"}) + df + +Similarly, an XML document can have a default namespace without prefix. Failing +to assign a temporary prefix will return no nodes and raise a ``ValueError``. +But assiging *any* temporary name to correct URI allows parsing by nodes. + +.. ipython:: python + + xml = """ + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + + """ + + df = pd.read_xml(xml, + xpath="//pandas:row", + namespaces={"pandas": "https://example.com"}) + df + +However, if XPath does not reference node names such as default, ``/\*``, then +``namespaces`` is not required. + +With `lxml`_ as parser, you can flatten nested XML documents with an XSLT +script which also can be string/file/URL types. As background, `XSLT`_ is +a special-purpose language written in a special XML file that can transform +original XML documents into other XML, HTML, even text (CSV, JSON, etc.) +using an XSLT processor. + +.. _lxml: https://lxml.de +.. _XSLT: https://www.w3.org/TR/xslt/ + +For example, consider this somewhat nested structure of Chicago "L" Rides +where station and rides elements encapsulate data in their own sections. 
+With below XSLT, ``lxml`` can transform original nested document into a flatter +output (as shown below for demonstration) for easier parse into ``DataFrame``: + +.. ipython:: python + + xml = """ + + + + 2020-09-01T00:00:00 + + 864.2 + 534 + 417.2 + + + + + 2020-09-01T00:00:00 + + 2707.4 + 1909.8 + 1438.6 + + + + + 2020-09-01T00:00:00 + + 2949.6 + 1657 + 1453.8 + + + """ + + xsl = """ + + + + + + + + + + + + + + + """ + + output = """ + + + 40850 + Library + 2020-09-01T00:00:00 + 864.2 + 534 + 417.2 + + + 41700 + Washington/Wabash + 2020-09-01T00:00:00 + 2707.4 + 1909.8 + 1438.6 + + + 40380 + Clark/Lake + 2020-09-01T00:00:00 + 2949.6 + 1657 + 1453.8 + + """ + + df = pd.read_xml(xml, stylesheet=xsl) + df + + +.. _io.xml: + +Writing XML +''''''''''' + +.. versionadded:: 1.3.0 + +``DataFrame`` objects have an instance method ``to_xml`` which renders the +contents of the ``DataFrame`` as an XML document. + +.. note:: + + This method does not support special properties of XML including DTD, + CData, XSD schemas, processing instructions, comments, and others. + Only namespaces at the root level is supported. However, ``stylesheet`` + allows design changes after initial output. + +Let's look at a few examples. + +Write an XML without options: + +.. ipython:: python + + geom_df = pd.DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, np.nan, 3], + } + ) + + print(geom_df.to_xml()) + + +Write an XML with new root and row name: + +.. ipython:: python + + print(geom_df.to_xml(root_name="geometry", row_name="objects")) + +Write an attribute-centric XML: + +.. ipython:: python + + print(geom_df.to_xml(attr_cols=geom_df.columns.tolist())) + +Write a mix of elements and attributes: + +.. 
ipython:: python + + print( + geom_df.to_xml( + index=False, + attr_cols=['shape'], + elem_cols=['degrees', 'sides']) + ) + +Any ``DataFrames`` with hierarchical columns will be flattened for XML element names +with levels delimited by underscores: + +.. ipython:: python + + ext_geom_df = pd.DataFrame( + { + "type": ["polygon", "other", "polygon"], + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, np.nan, 3], + } + ) + + pvt_df = ext_geom_df.pivot_table(index='shape', + columns='type', + values=['degrees', 'sides'], + aggfunc='sum') + pvt_df + + print(pvt_df.to_xml()) + +Write an XML with default namespace: + +.. ipython:: python + + print(geom_df.to_xml(namespaces={"": "https://example.com"})) + +Write an XML with namespace prefix: + +.. ipython:: python + + print( + geom_df.to_xml(namespaces={"doc": "https://example.com"}, + prefix="doc") + ) + +Write an XML without declaration or pretty print: + +.. ipython:: python + + print( + geom_df.to_xml(xml_declaration=False, + pretty_print=False) + ) + +Write an XML and transform with stylesheet: + +.. ipython:: python + + xsl = """ + + + + + + + + + + + polygon + + + + + + + + """ + + print(geom_df.to_xml(stylesheet=xsl)) + + +XML Final Notes +''''''''''''''' + +* All XML documents adhere to `W3C specifications`_. Both ``etree`` and ``lxml`` + parsers will fail to parse any markup document that is not well-formed or + follows XML syntax rules. Do be aware HTML is not an XML document unless it + follows XHTML specs. However, other popular markup types including KML, XAML, + RSS, MusicML, MathML are compliant `XML schemas`_. + +* For above reason, if your application builds XML prior to pandas operations, + use appropriate DOM libraries like ``etree`` and ``lxml`` to build the necessary + document and not by string concatenation or regex adjustments. Always remember + XML is a *special* and not any text file. 
+ +* With very large XML files (several hundred MBs to GBs), XPath and XSLT + can become memory-intensive operations. Be sure to have enough available + RAM for reading and writing to large XML files (roughly about 5 times the + size of text). + +* Because XSLT is a programming language, use it with caution since such scripts + can pose a security risk in your environment and can run large or infinite + recursive operations. Always test scripts on small fragments before full run. + +* The `etree`_ parser supports all functionality of both ``read_xml`` and + ``to_xml`` except for complex XPath and any XSLT. Though limited in features, + ``etree`` is still a reliable and capable parser and tree builder. Its + performance may trail ``lxml`` to a certain degree for larger files but + relatively unnoticeable on small to medium size files. + +.. _`W3C specifications`: https://www.w3.org/TR/xml/ +.. _`XML schemas`: https://en.wikipedia.org/wiki/List_of_types_of_XML_schemas +.. _`etree`: https://docs.python.org/3/library/xml.etree.elementtree.html + + .. 
_io.excel: From 2effae0a9cfba39fba48b92b152dcd5a8799867d Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 10 Feb 2021 13:12:37 -0600 Subject: [PATCH 17/35] Add section title in whatsnew and tree builder for lxml dependency in install.rst --- doc/source/getting_started/install.rst | 3 ++- doc/source/whatsnew/v1.3.0.rst | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index d49c2698a1ace..aac713f29f16c 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -271,7 +271,8 @@ fsspec 0.7.4 Handling files aside from local and fastparquet 0.4.0 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) -lxml 4.3.0 HTML parser for read_html (see :ref:`note `); XML parser for read_xml +lxml 4.3.0 | HTML parser for read_html (see :ref:`note `) + | XML parser for read_xml and tree builder for to_xml matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.6.0 Reading / writing for xlsx files diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 77867b331f596..07bcd4b024693 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -41,9 +41,12 @@ See ref:`window.overview` for performance and functional benefits. (:issue:`1509 .. _whatsnew_130.read_to_xml: +Read and write XML documents +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + We added I/O support to read and render shallow versions of `XML`_ documents with :func:`pandas.read_xml` and :meth:`DataFrame.to_xml`. Using `lxml`_ as parser, -full XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) +both XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) .. _XML: https://www.w3.org/standards/xml/core .. _lxml: https://lxml.de @@ -101,7 +104,7 @@ full XPath 1.0 and XSLT 1.0 is available. 
(:issue:`27554`) -For more, see :ref:`io` in the user guide on IO tools. +For more, see :ref:`io.xml` in the user guide on IO tools. .. _whatsnew_130.enhancements.other: From 35fa6a6b4e0e99217d05a1e5786559a8544dd890 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 10 Feb 2021 22:52:11 -0600 Subject: [PATCH 18/35] Clean up merge issue in whatsnew, remove escape in io.rst, adjust exceptions with added tests --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v1.3.0.rst | 6 ------ pandas/io/formats/xml.py | 9 +++------ pandas/io/xml.py | 7 ++----- pandas/tests/io/formats/test_to_xml.py | 24 +++++++++++++++++++++--- pandas/tests/io/test_xml.py | 11 +++++++++++ 6 files changed, 38 insertions(+), 21 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 185432d3c09d4..28f7b30974e1a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3012,7 +3012,7 @@ But assiging *any* temporary name to correct URI allows parsing by nodes. namespaces={"pandas": "https://example.com"}) df -However, if XPath does not reference node names such as default, ``/\*``, then +However, if XPath does not reference node names such as default, ``/*``, then ``namespaces`` is not required. With `lxml`_ as parser, you can flatten nested XML documents with an XSLT diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ecf5ca481ab13..edc42ee1552ec 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -33,12 +33,6 @@ For example: storage_options=headers ) -.. _whatsnew_130.window_method_table: - -:class:`Rolling` and :class:`Expanding` now support a ``method`` argument with a -``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. -See ref:`window.overview` for performance and functional benefits. (:issue:`15095`) - .. 
_whatsnew_130.read_to_xml: Read and write XML documents diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index b361748dca819..cd3fa80b66e0f 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -6,7 +6,6 @@ import io from typing import Any, Dict, List, Optional, Union from urllib.error import HTTPError, URLError -from warnings import warn from pandas._typing import FilePathOrBuffer from pandas.errors import AbstractMethodError @@ -252,7 +251,7 @@ def write_output(self) -> Optional[str]: out_str = None - except (UnicodeDecodeError, OSError, FileNotFoundError) as e: + except (OSError, FileNotFoundError) as e: raise e return out_str @@ -299,10 +298,8 @@ def build_tree(self) -> bytes: self.out_xml = self.remove_declaration() if self.stylesheet: - warn( - "To use stylesheet, you need lxml installed. " - "Instead, the non-transformed, original XML is returned.", - UserWarning, + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." ) return self.out_xml diff --git a/pandas/io/xml.py b/pandas/io/xml.py index cb66bd79f4c48..cd62b02f7f095 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -6,7 +6,6 @@ import io from typing import Dict, List, Optional, Union from urllib.error import HTTPError, URLError -from warnings import warn from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency @@ -231,10 +230,8 @@ def __init__(self, *args, **kwargs): def parse_data(self) -> List[Dict[str, Optional[str]]]: if self.stylesheet: - warn( - "To use stylesheet, you need lxml installed. " - "Nodes will be parsed on original XML at the xpath.", - UserWarning, + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." 
) self.xml_doc = self._parse_doc() diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 079cc6bf0fa39..5144d13401e73 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -24,15 +24,15 @@ [X] - LookupError("unknown encoding") [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") -[] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) +[X] - ValueError("To use stylesheet, you need lxml installed...") +[] - OSError (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) [X] - FileNotFoundError("No such file or directory") lxml [X] - TypeError("...is not a valid type for attr_cols") [X] - TypeError("...is not a valid type for elem_cols") [X] - LookupError("unknown encoding") -[] - UnicodeDecodeError (NEED NON-UTF-8 STYLESHEET) -[] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) +[] - OSError (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) [X] - FileNotFoundError("No such file or directory") [X] - KeyError("...is not included in namespaces") [X] - KeyError("no valid column") @@ -1153,6 +1153,24 @@ def test_incorrect_xsl_apply(parser): geom_df.to_xml(path, stylesheet=xsl) +def test_stylesheet_with_etree(datapath): + xsl = """\ + + + + + + + + + """ + + with pytest.raises( + ValueError, match=("To use stylesheet, you need lxml installed") + ): + geom_df.to_xml(parser="etree", stylesheet=xsl) + + @td.skip_if_no("lxml") def test_style_to_csv(): xsl = """\ diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index cd5738d259eb9..ef695cb12fc6a 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -24,6 +24,7 @@ [X] - ValueError("names does not match length of child elements in xpath.") [X] - TypeError("...is not a valid type for names") [X] - ValueError("io is not a url, file, or xml string") +[X] - ValueError("To use stylesheet, you need lxml installed...") [] - URLError (GENERAL ERROR WITH HTTPError AS 
SUBCLASS) [X] - HTTPError("HTTP Error 404: Not Found") [] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) @@ -844,6 +845,16 @@ def test_wrong_stylesheet(): read_xml(kml, stylesheet=xsl) +def test_stylesheet_with_etree(datapath): + kml = os.path.join("data", "xml", "cta_rail_lines.kml") + xsl = os.path.join("data", "xml", "flatten_doc.xsl") + + with pytest.raises( + ValueError, match=("To use stylesheet, you need lxml installed") + ): + read_xml(kml, parser="etree", stylesheet=xsl) + + @tm.network @td.skip_if_no("lxml") def test_online_stylesheet(): From 947840a32b154706c532c7a148ec91bd930b6b22 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 15 Feb 2021 22:32:03 -0600 Subject: [PATCH 19/35] Remove redundant try/except and fix default namespace condition --- pandas/io/formats/xml.py | 53 +++++++------------ pandas/io/xml.py | 110 ++++++++++++++------------------------- 2 files changed, 59 insertions(+), 104 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index cd3fa80b66e0f..ea18ca851e8c7 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -5,7 +5,6 @@ import codecs import io from typing import Any, Dict, List, Optional, Union -from urllib.error import HTTPError, URLError from pandas._typing import FilePathOrBuffer from pandas.errors import AbstractMethodError @@ -148,10 +147,7 @@ def validate_encoding(self) -> None: * If encoding is not available in codecs. 
""" - try: - codecs.lookup(self.encoding) - except LookupError as e: - raise e + codecs.lookup(self.encoding) def process_dataframe(self) -> Dict[Union[int, str], Dict[str, Any]]: """ @@ -244,15 +240,11 @@ def write_output(self) -> Optional[str]: xml_doc = self.build_tree() out_str: Optional[str] = xml_doc.decode(self.encoding).rstrip() - try: - if self.path_or_buffer and isinstance(self.path_or_buffer, str): - with open(self.path_or_buffer, "wb") as f: - f.write(xml_doc) + if self.path_or_buffer and isinstance(self.path_or_buffer, str): + with open(self.path_or_buffer, "wb") as f: + f.write(xml_doc) - out_str = None - - except (OSError, FileNotFoundError) as e: - raise e + out_str = None return out_str @@ -315,7 +307,7 @@ def get_prefix_uri(self) -> str: if self.prefix: try: uri = f"{{{self.namespaces[self.prefix]}}}" - except (KeyError): + except KeyError: raise KeyError(f"{self.prefix} is not included in namespaces") else: uri = f'{{{self.namespaces[""]}}}' @@ -452,7 +444,7 @@ def get_prefix_uri(self) -> str: if self.prefix: try: uri = f"{{{self.namespaces[self.prefix]}}}" - except (KeyError): + except KeyError: raise KeyError(f"{self.prefix} is not included in namespaces") else: uri = f'{{{self.namespaces[""]}}}' @@ -554,7 +546,7 @@ def parse_doc(self): * If io object is not readable as string or file-like object. 
""" - from lxml.etree import XML, XMLParser, XMLSyntaxError, parse + from lxml.etree import XML, XMLParser, parse current_doc = self.convert_io() if current_doc and isinstance(current_doc, str): @@ -562,18 +554,15 @@ def parse_doc(self): else: raise ValueError("stylesheet is not a url, file, or xml string") - try: - curr_parser = XMLParser(encoding=self.encoding) + curr_parser = XMLParser(encoding=self.encoding) - if is_url(current_doc): - with urlopen(current_doc) as f: - r = parse(f, parser=curr_parser) - elif is_xml: - r = XML(bytes(current_doc, encoding=self.encoding)) - else: - r = parse(current_doc, parser=curr_parser) - except (LookupError, URLError, HTTPError, OSError, XMLSyntaxError) as e: - raise e + if is_url(current_doc): + with urlopen(current_doc) as f: + r = parse(f, parser=curr_parser) + elif is_xml: + r = XML(bytes(current_doc, encoding=self.encoding)) + else: + r = parse(current_doc, parser=curr_parser) return r @@ -584,15 +573,11 @@ def transform_doc(self) -> bytes: This method will transform built tree with XSLT script. """ - from lxml.etree import XSLT, XSLTApplyError, XSLTParseError + from lxml.etree import XSLT xsl_doc = self.parse_doc() - try: - transformer = XSLT(xsl_doc) - new_doc = transformer(self.root) - - except (XSLTApplyError, XSLTParseError) as e: - raise e + transformer = XSLT(xsl_doc) + new_doc = transformer(self.root) return bytes(new_doc) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index cd62b02f7f095..9048de3fdb401 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1,11 +1,9 @@ """ :mod:`pandas.io.xml` is a module for reading XML. 
- """ import io from typing import Dict, List, Optional, Union -from urllib.error import HTTPError, URLError from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency @@ -313,11 +311,9 @@ def _parse_nodes(self) -> List[Dict[str, Optional[str]]]: for el in elems ] - if self.namespaces: - dicts = [ - {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} - for d in dicts - ] + dicts = [ + {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts + ] keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] @@ -375,13 +371,7 @@ def _validate_names(self) -> None: ) def _parse_doc(self) -> Union[Element, ElementTree]: - from xml.etree.ElementTree import ( - Element, - ElementTree, - ParseError, - fromstring, - parse, - ) + from xml.etree.ElementTree import Element, ElementTree, fromstring, parse current_doc = self._convert_io(self.io) if current_doc: @@ -393,16 +383,13 @@ def _parse_doc(self) -> Union[Element, ElementTree]: raise ValueError("io is not a url, file, or xml string.") r: Union[Element, ElementTree] - try: - if is_url(current_doc): - with urlopen(current_doc) as f: - r = parse(f) - elif is_xml: - r = fromstring(current_doc) - else: - r = parse(current_doc) - except (URLError, HTTPError, OSError, FileNotFoundError, ParseError) as e: - raise e + if is_url(current_doc): + with urlopen(current_doc) as f: + r = parse(f) + elif is_xml: + r = fromstring(current_doc) + else: + r = parse(current_doc) return r @@ -531,36 +518,29 @@ def _transform_doc(self): am ideally flatter xml document for easier parsing and migration to Data Frame. 
""" - from lxml.etree import XSLT, XSLTApplyError, XSLTParseError + from lxml.etree import XSLT - try: - transformer = XSLT(self.xsl_doc) - new_doc = transformer(self.xml_doc) - except (XSLTApplyError, XSLTParseError) as e: - raise e + transformer = XSLT(self.xsl_doc) + new_doc = transformer(self.xml_doc) return new_doc def _validate_path(self) -> None: - from lxml.etree import XPathEvalError, XPathSyntaxError - try: - elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) - children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) - attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) - - if (elems == [] and attrs == [] and children == []) or ( - elems != [] and attrs == [] and children == [] - ): - raise ValueError( - "xpath does not return any nodes. " - "Be sure row level nodes are in xpath. " - "If document uses namespaces denoted with " - "xmlns, be sure to define namespaces and " - "use them in xpath." - ) - except (XPathEvalError, XPathSyntaxError, TypeError) as e: - raise e + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) + children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) + attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) + + if (elems == [] and attrs == [] and children == []) or ( + elems != [] and attrs == [] and children == [] + ): + raise ValueError( + "xpath does not return any nodes. " + "Be sure row level nodes are in xpath. " + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." 
+ ) def _validate_names(self) -> None: """ @@ -590,7 +570,7 @@ def _validate_names(self) -> None: ) def _parse_doc(self): - from lxml.etree import XML, XMLParser, XMLSyntaxError, parse + from lxml.etree import XML, XMLParser, parse self.raw_doc = self.stylesheet if self.is_style else self.io @@ -603,27 +583,17 @@ def _parse_doc(self): else: raise ValueError("io is not a url, file, or xml string.") - try: - curr_parser = XMLParser(encoding=self.encoding) - - if is_url(current_doc): - with urlopen(current_doc) as f: - r = parse(f, parser=curr_parser) - elif is_xml and isinstance(current_doc, str): - r = XML(bytes(current_doc, encoding=self.encoding)) - elif is_xml and isinstance(current_doc, bytes): - r = XML(current_doc) - else: - r = parse(current_doc, parser=curr_parser) - except ( - LookupError, - URLError, - HTTPError, - OSError, - FileNotFoundError, - XMLSyntaxError, - ) as e: - raise e + curr_parser = XMLParser(encoding=self.encoding) + + if is_url(current_doc): + with urlopen(current_doc) as f: + r = parse(f, parser=curr_parser) + elif is_xml and isinstance(current_doc, str): + r = XML(bytes(current_doc, encoding=self.encoding)) + elif is_xml and isinstance(current_doc, bytes): + r = XML(current_doc) + else: + r = parse(current_doc, parser=curr_parser) return r From cb34dde4903639fdb56415e59bfa19f24cc4c1a3 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 20 Feb 2021 15:44:46 -0600 Subject: [PATCH 20/35] Replace path or buffer handling with get_handle and add compression and storage_options --- doc/source/user_guide/io.rst | 13 +- pandas/core/frame.py | 17 ++ pandas/io/formats/format.py | 17 ++ pandas/io/formats/xml.py | 176 +++++++++++--------- pandas/io/xml.py | 219 ++++++++++++++++--------- pandas/tests/io/data/xml/geom_xml.bz2 | Bin 0 -> 182 bytes pandas/tests/io/data/xml/geom_xml.gz | Bin 0 -> 166 bytes pandas/tests/io/data/xml/geom_xml.xz | Bin 0 -> 200 bytes pandas/tests/io/formats/test_to_xml.py | 120 ++++++++++++++ pandas/tests/io/test_xml.py 
| 156 ++++++++++++++---- 10 files changed, 526 insertions(+), 192 deletions(-) create mode 100644 pandas/tests/io/data/xml/geom_xml.bz2 create mode 100644 pandas/tests/io/data/xml/geom_xml.gz create mode 100644 pandas/tests/io/data/xml/geom_xml.xz diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 28f7b30974e1a..7fad2adeb6c14 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2925,6 +2925,17 @@ Read in the content of the "books.xml" as instance of ``StringIO`` or df = pd.read_xml(bio) df +Even read XML from AWS S3 buckets such as Python Software Foundation's IRS 990 Form: + +.. ipython:: python + + df = pd.read_xml( + "s3://irs-form-990/201923199349319487_public.xml", + xpath=".//irs:Form990PartVIISectionAGrp", + namespaces={"irs": "http://www.irs.gov/efile"} + ) + df + With `lxml`_ as default ``parser``, you access the full-featured XML library that extends Python's ElementTree API. One powerful tool is ability to query nodes selectively or conditionally with more expressive XPath: @@ -3254,7 +3265,7 @@ XML Final Notes * For above reason, if your application builds XML prior to pandas operations, use appropriate DOM libraries like ``etree`` and ``lxml`` to build the necessary document and not by string concatenation or regex adjustments. Always remember - XML is a *special* and not any text file. + XML is a *special* text file with markup rules. * With very large XML files (several hundred MBs to GBs), XPath and XSLT can become memory-intensive operations. 
Be sure to have enough available diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e257ac199d5a..fd6e1c2c24fae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2611,6 +2611,8 @@ def to_xml( pretty_print: Optional[bool] = True, parser: Optional[str] = "lxml", stylesheet: Optional[FilePathOrBuffer[str]] = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, ) -> Optional[str]: """ Render a DataFrame to an XML document. @@ -2668,6 +2670,19 @@ def to_xml( layout of elements and attributes from original output. This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc., if using a URL that will be + parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be + raised if providing this argument with a non-fsspec URL. See the fsspec + and backend storage implementation docs for the set of allowed keys and + values. 
Returns ------- @@ -2764,6 +2779,8 @@ def to_xml( pretty_print=pretty_print, parser=parser, stylesheet=stylesheet, + compression=compression, + storage_options=storage_options, ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6a2852123dcba..1e2e1be8a40d2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1021,6 +1021,8 @@ def to_xml( pretty_print: Optional[bool] = True, parser: Optional[str] = "lxml", stylesheet: Optional[FilePathOrBuffer[str]] = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, ) -> Optional[str]: """ Render a DataFrame to an XML document. @@ -1078,6 +1080,19 @@ def to_xml( layout of elements and attributes from original output. This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc., if using a URL that will be + parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be + raised if providing this argument with a non-fsspec URL. See the fsspec + and backend storage implementation docs for the set of allowed keys and + values. 
""" from pandas.io.formats.xml import EtreeXMLFormatter, LxmlXMLFormatter @@ -1115,6 +1130,8 @@ def to_xml( xml_declaration=xml_declaration, pretty_print=pretty_print, stylesheet=stylesheet, + compression=compression, + storage_options=storage_options, ) return xml_formatter.write_output() diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index ea18ca851e8c7..8defe95ac6d8b 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -6,12 +6,12 @@ import io from typing import Any, Dict, List, Optional, Union -from pandas._typing import FilePathOrBuffer +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import is_list_like -from pandas.io.common import is_url, urlopen +from pandas.io.common import get_handle, is_url, urlopen from pandas.io.formats.format import DataFrameFormatter @@ -62,6 +62,14 @@ class BaseXMLFormatter: stylesheet : str or file-like A URL, file, file-like object, or a raw string containing XSLT. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + Compression type for on-the-fly decompression of on-disk data. + If 'infer', then use extension for gzip, bz2, zip or xz. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. 
host, port, username, password, etc., + See also -------- pandas.io.formats.xml.EtreeXMLFormatter @@ -85,6 +93,8 @@ def __init__( xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, stylesheet: Optional[FilePathOrBuffer[str]] = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, ) -> None: self.fmt = formatter self.path_or_buffer = path_or_buffer @@ -100,8 +110,10 @@ def __init__( self.xml_declaration = xml_declaration self.pretty_print = pretty_print self.stylesheet = stylesheet - self.frame = self.fmt.frame + self.compression = compression + self.storage_options = storage_options + self.frame = self.fmt.frame self.orig_cols = self.fmt.frame.columns.tolist() self.frame_dicts = self.process_dataframe() @@ -238,15 +250,23 @@ def build_elems(self) -> None: def write_output(self) -> Optional[str]: xml_doc = self.build_tree() - out_str: Optional[str] = xml_doc.decode(self.encoding).rstrip() - if self.path_or_buffer and isinstance(self.path_or_buffer, str): - with open(self.path_or_buffer, "wb") as f: - f.write(xml_doc) + out_str: Optional[str] - out_str = None + if self.path_or_buffer is not None: + # apply compression and byte/text conversion + with get_handle( + self.path_or_buffer, + "wb", + compression=self.compression, + storage_options=self.storage_options, + is_text=False, + ) as handles: + handles.handle.write(xml_doc) # type: ignore[arg-type] + return None - return out_str + else: + return xml_doc.decode(self.encoding).rstrip() class EtreeXMLFormatter(BaseXMLFormatter): @@ -277,6 +297,7 @@ def build_tree(self) -> bytes: if not self.attr_cols and not self.elem_cols: self.elem_cols = list(self.frame_dicts[0].keys()) self.build_elems() + else: self.build_attribs() self.build_elems() @@ -315,42 +336,46 @@ def get_prefix_uri(self) -> str: return uri def build_attribs(self) -> None: - if self.attr_cols: - for col in self.attr_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - 
"".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - attr_name = f"{self.prefix_uri}{flat_col}" - try: - if self.d[col] is not None: - self.elem_row.attrib[attr_name] = str(self.d[col]) - except KeyError: - raise KeyError(f"no valid column, {col}") + if not self.attr_cols: + return + + for col in self.attr_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + attr_name = f"{self.prefix_uri}{flat_col}" + try: + if self.d[col] is not None: + self.elem_row.attrib[attr_name] = str(self.d[col]) + except KeyError: + raise KeyError(f"no valid column, {col}") def build_elems(self) -> None: from xml.etree.ElementTree import SubElement - if self.elem_cols: - for col in self.elem_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - elem_name = f"{self.prefix_uri}{flat_col}" - try: - val = None if self.d[col] in [None, ""] else str(self.d[col]) - SubElement(self.elem_row, elem_name).text = val - except KeyError: - raise KeyError(f"no valid column, {col}") + if not self.elem_cols: + return + + for col in self.elem_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + elem_name = f"{self.prefix_uri}{flat_col}" + try: + val = None if self.d[col] in [None, ""] else str(self.d[col]) + SubElement(self.elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") def prettify_tree(self) -> bytes: """ @@ -410,6 +435,7 @@ def build_tree(self) -> bytes: if not self.attr_cols and not self.elem_cols: self.elem_cols = list(self.frame_dicts[0].keys()) self.build_elems() + else: self.build_attribs() self.build_elems() @@ -452,42 +478,46 @@ def 
get_prefix_uri(self) -> str: return uri def build_attribs(self) -> None: - if self.attr_cols: - for col in self.attr_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - attr_name = f"{self.prefix_uri}{flat_col}" - try: - if self.d[col] is not None: - self.elem_row.attrib[attr_name] = self.d[col] - except KeyError: - raise KeyError(f"no valid column, {col}") + if not self.attr_cols: + return + + for col in self.attr_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + attr_name = f"{self.prefix_uri}{flat_col}" + try: + if self.d[col] is not None: + self.elem_row.attrib[attr_name] = self.d[col] + except KeyError: + raise KeyError(f"no valid column, {col}") def build_elems(self) -> None: from lxml.etree import SubElement - if self.elem_cols: - for col in self.elem_cols: - flat_col = col - if isinstance(col, tuple): - flat_col = ( - "".join(str(c) for c in col).strip() - if "" in col - else "_".join(str(c) for c in col).strip() - ) - - elem_name = f"{self.prefix_uri}{flat_col}" - try: - val = None if self.d[col] in [None, ""] else str(self.d[col]) - SubElement(self.elem_row, elem_name).text = val - except KeyError: - raise KeyError(f"no valid column, {col}") + if not self.elem_cols: + return + + for col in self.elem_cols: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join(str(c) for c in col).strip() + if "" in col + else "_".join(str(c) for c in col).strip() + ) + + elem_name = f"{self.prefix_uri}{flat_col}" + try: + val = None if self.d[col] in [None, ""] else str(self.d[col]) + SubElement(self.elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") def convert_io(self) -> Union[bytes, str, None]: """ diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 
9048de3fdb401..c1ee926775e9c 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,7 +5,7 @@ import io from typing import Dict, List, Optional, Union -from pandas._typing import FilePathOrBuffer +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError, ParserError @@ -13,7 +13,13 @@ from pandas.core.frame import DataFrame -from pandas.io.common import is_url, stringify_path, urlopen +from pandas.io.common import ( + file_exists, + get_handle, + is_fsspec_url, + is_url, + stringify_path, +) from pandas.io.parsers import TextParser @@ -23,9 +29,9 @@ class _XMLFrameParser: Parameters ---------- - io : str or file-like - This can be either a string of raw XML, a valid URL, - file or file-like object. + path_or_buffer : a valid JSON str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. xpath : str or regex The XPath expression to parse required set of nodes for @@ -51,6 +57,14 @@ class _XMLFrameParser: URL, file, file-like object, or a raw string containing XSLT, `etree` does not support XSLT but retained for consistency. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + Compression type for on-the-fly decompression of on-disk data. + If 'infer', then use extension for gzip, bz2, zip or xz. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. 
host, port, username, password, etc., + See also -------- pandas.io.xml._EtreeFrameParser @@ -72,7 +86,7 @@ class _XMLFrameParser: def __init__( self, - io, + path_or_buffer, xpath, namespaces, elems_only, @@ -80,8 +94,10 @@ def __init__( names, encoding, stylesheet, + compression, + storage_options, ): - self.io = io + self.path_or_buffer = path_or_buffer self.xpath = xpath self.namespaces = namespaces self.elems_only = elems_only @@ -90,6 +106,8 @@ def __init__( self.encoding = encoding self.stylesheet = stylesheet self.is_style = None + self.compression = compression + self.storage_options = storage_options def parse_data(self) -> List[Dict[str, Optional[str]]]: """ @@ -154,37 +172,54 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) - def _convert_io(self, xml_data) -> Union[str, bytes, None]: + def _preprocess_data(self, data): """ - Convert io object to string. + Convert extracted raw data. - This method will convert io object into a string or keep - as string, depending on object type. + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. """ + if hasattr(data, "read"): + data = data.read() - obj: Union[bytes, str, None] = None + if not hasattr(data, "read") and isinstance(data, str): + data = io.StringIO(data) - if isinstance(xml_data, str): - obj = xml_data + if not hasattr(data, "read") and isinstance(data, bytes): + data = io.BytesIO(data) - elif isinstance(xml_data, bytes): - obj = xml_data.decode(self.encoding) + return data - elif isinstance(xml_data, io.StringIO): - obj = xml_data.getvalue() - - elif isinstance(xml_data, io.BytesIO): - obj = xml_data.getvalue().decode(self.encoding) + def _get_data_from_filepath(self, filepath_or_buffer): + """ + Extract raw XML data. 
- elif isinstance(xml_data, io.TextIOWrapper): - obj = xml_data.read() + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. XML bytes - elif isinstance(xml_data, io.BufferedReader): - obj = xml_data.read().decode(self.encoding) - else: - obj = None + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): + self.handles = get_handle( + filepath_or_buffer, + "r", + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + filepath_or_buffer = self.handles.handle - return obj + return filepath_or_buffer def _parse_doc(self): """ @@ -371,25 +406,33 @@ def _validate_names(self) -> None: ) def _parse_doc(self) -> Union[Element, ElementTree]: - from xml.etree.ElementTree import Element, ElementTree, fromstring, parse - - current_doc = self._convert_io(self.io) - if current_doc: - if isinstance(current_doc, str): - is_xml = current_doc.startswith((" None: def _parse_doc(self): from lxml.etree import XML, XMLParser, parse - self.raw_doc = self.stylesheet if self.is_style else self.io + self.raw_doc = self.stylesheet if self.is_style else self.path_or_buffer - current_doc = self._convert_io(self.raw_doc) - if current_doc: - if isinstance(current_doc, str): - is_xml = current_doc.startswith((" DataFrame: """ @@ -654,7 +696,7 @@ def _parse( if parser == "lxml": if lxml is not None: p = _LxmlFrameParser( - io, + path_or_buffer, xpath, namespaces, elems_only, @@ -662,24 +704,15 @@ def _parse( names, encoding, stylesheet, + compression, + storage_options, ) else: raise ImportError("lxml not found, please install or use the etree parser.") - p = _EtreeFrameParser( - io, - xpath, 
- namespaces, - elems_only, - attrs_only, - names, - encoding, - stylesheet, - ) - elif parser == "etree": p = _EtreeFrameParser( - io, + path_or_buffer, xpath, namespaces, elems_only, @@ -687,6 +720,8 @@ def _parse( names, encoding, stylesheet, + compression, + storage_options, ) else: raise ValueError("Values for parser can only be lxml or etree.") @@ -697,7 +732,7 @@ def _parse( def read_xml( - io: FilePathOrBuffer, + path_or_buffer: FilePathOrBuffer, xpath: Optional[str] = "./*", namespaces: Optional[Union[dict, List[dict]]] = None, elems_only: Optional[bool] = False, @@ -706,6 +741,8 @@ def read_xml( encoding: Optional[str] = "utf-8", parser: Optional[str] = "lxml", stylesheet: Optional[FilePathOrBuffer[str]] = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. @@ -714,8 +751,9 @@ def read_xml( Parameters ---------- - io : str, path object or file-like object - A URL, file-like object, or raw string containing XML. + path_or_buffer : str, path object, or file-like object + Any valid XML string or path is acceptable. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. xpath : str, optional, default './\*' The XPath to parse required set of nodes for migration to DataFrame. @@ -764,6 +802,21 @@ def read_xml( transformation and not the original XML document. Only XSLT 1.0 scripts and not later versions is currently supported. + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. 
+ + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc., if using a URL that will be + parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be + raised if providing this argument with a non-fsspec URL. See the fsspec + and backend storage implementation docs for the set of allowed keys and + values. + Returns ------- df @@ -877,10 +930,10 @@ def read_xml( 2 triangle 180 3.0 """ - io = stringify_path(io) + path_or_buffer = stringify_path(path_or_buffer) return _parse( - io=io, + path_or_buffer=path_or_buffer, xpath=xpath, namespaces=namespaces, elems_only=elems_only, @@ -889,4 +942,6 @@ def read_xml( encoding=encoding, parser=parser, stylesheet=stylesheet, + compression=compression, + storage_options=storage_options, ) diff --git a/pandas/tests/io/data/xml/geom_xml.bz2 b/pandas/tests/io/data/xml/geom_xml.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..eef4b3597e6eb38b6c0b2c1ba23bcdd27336f60b GIT binary patch literal 182 zcmV;n07?HsT4*^jL0KkKSxE8zK>z?hS%5$gK!fc^ujl{3FaWs27)(tD2*fflm^38O zkTe=J01X6+B*HX#5Yr)`#6bjv4IxD&#YR?6PYyDd%fU|j91CQMi?`2`n(UxNy-=9E z=HGx&{|Fj kK$@grDs3x?pb?S)V8bt%JtGIW7GI0GBAh5lBzXTIfOhvx$^ZZW literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/xml/geom_xml.gz b/pandas/tests/io/data/xml/geom_xml.gz new file mode 100644 index 0000000000000000000000000000000000000000..8cd899f91024fdebe12aee22d6b7574c998833c0 GIT binary patch literal 166 zcmV;X09pSZiwFn}q%dFt|7T@yZC`k8Yyg#z!4AS85Jd0&3T~V%ZE8#plzz(wRwSvQ z3spZ~D)po_@w79U-IvMA@o}Em4Jo?d2j12VXXte>&f9^{>A36ps48h7fvOmj6z04%^yb^rhX literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/xml/geom_xml.xz b/pandas/tests/io/data/xml/geom_xml.xz new file mode 100644 index 0000000000000000000000000000000000000000..7ef61d2925b05c77b390ca432160a542e8e65bec GIT binary patch literal 200 zcmV;(05|{rH+ooF000E$*0e?f03iVu0001VFXf})0Ym_ZT>u^r%ZCxz&SsGhgC5HL 
zdo9`@l$!Nks(G>G~_q!H$g@O_5n)ug+ru2jawQ+!V5g~{~QxIWgJo-WG)vtUM ziz`pe{lmqHf$N|3oaLKM0JX-1QM|LUl8}?}eQ){EXNG??L8FjQ!mN&^unnSyvi1gb z2vzZ0XY>v!&=~lufy!lc@BjekzSlrkJ^XF}0i*%N0ssKdWUoH4#Ao{g000001X)^s Cj$QTu literal 0 HcmV?d00001 diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 5144d13401e73..f5392442c7dad 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1274,3 +1274,123 @@ def test_style_to_json(): out_xml = geom_df.to_xml(stylesheet=xsl) assert out_json == out_xml + + +# COMPRESSION + +geom_xml = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +def test_bz2_output(parser): + import bz2 + + with tm.ensure_clean() as path: + geom_df.to_xml(path, compression="bz2") + + with bz2.BZ2File(path, "rb") as fp: + output = fp.read() + + # etree and lxml differs on quotes and case in xml declaration + output = output.decode("utf-8").replace( + ' @@ -211,18 +217,27 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) -def test_wrong_io_object(parser): - with pytest.raises(ValueError, match=("io is not a url, file, or xml string")): - read_xml(DataFrame, parser=parser) +@td.skip_if_no("lxml") +def test_wrong_file_path_lxml(datapath): + from lxml.etree import XMLSyntaxError + + with pytest.raises( + XMLSyntaxError, + match=("Start tag expected, '<' not found"), + ): + filename = os.path.join("data", "html", "books.xml") + read_xml(filename, parser="lxml") + +def test_wrong_file_path_etree(datapath): + from xml.etree.ElementTree import ParseError -def test_wrong_file_path(datapath, parser): with pytest.raises( - (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory|没有那个文件或目录"), + ParseError, + match=("not well-formed"), ): filename = os.path.join("data", "html", "books.xml") - read_xml(filename, parser=parser) + 
read_xml(filename, parser="etree") @tm.network @@ -513,48 +528,35 @@ def test_names_option_wrong_type(datapath, parser): # ENCODING -@td.skip_if_no("lxml") -def test_wrong_encoding_lxml(datapath): - from lxml.etree import XMLSyntaxError - - filename = datapath("io", "data", "xml", "baby_names.xml") - with pytest.raises(XMLSyntaxError, match=("Input is not proper UTF-8")): - read_xml(filename) - - -@td.skip_if_no("lxml") -def test_utf16_encoding_lxml(datapath): - from lxml.etree import XMLSyntaxError - +def test_wrong_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") - with pytest.raises(XMLSyntaxError, match=("Start tag expected, '<' not found")): - read_xml(filename, encoding="UTF-16") + with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode")): + read_xml(filename, parser=parser) -@td.skip_if_no("lxml") -def test_unknown_encoding_lxml(datapath): +def test_utf16_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") - with pytest.raises(LookupError, match=("unknown encoding")): - read_xml(filename, encoding="UFT-8") + with pytest.raises(UnicodeError, match=("UTF-16 stream does not start with BOM")): + read_xml(filename, encoding="UTF-16", parser=parser) -# etree raises no error on wrong, utf-16, or unknown encoding -@pytest.mark.parametrize("encoding", [None, "UTF-16", "UFT-8"]) -def test_wrong_encoding_etree(datapath, encoding): +def test_unknown_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") - read_xml(filename, parser="etree", encoding=encoding) + with pytest.raises(LookupError, match=("unknown encoding: uft-8")): + read_xml(filename, encoding="UFT-8", parser=parser) def test_ascii_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") - read_xml(filename, encoding="ascii", parser=parser) + with pytest.raises(UnicodeDecodeError, match=("'ascii' codec can't decode byte")): + read_xml(filename, 
encoding="ascii", parser=parser) @td.skip_if_no("lxml") def test_parser_consistency_with_encoding(datapath): filename = datapath("io", "data", "xml", "baby_names.xml") df_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") - df_etree = read_xml(filename, parser="etree") + df_etree = read_xml(filename, parser="etree", encoding="iso-8859-1") tm.assert_frame_equal(df_lxml, df_etree) @@ -835,12 +837,14 @@ def test_incorrect_xsl_apply(datapath): @td.skip_if_no("lxml") def test_wrong_stylesheet(): + from lxml.etree import XMLSyntaxError + kml = os.path.join("data", "xml", "cta_rail_lines.kml") xsl = os.path.join("data", "xml", "flatten.xsl") with pytest.raises( - (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory|没有那个文件或目录"), + XMLSyntaxError, + match=("Start tag expected, '<' not found"), ): read_xml(kml, stylesheet=xsl) @@ -888,3 +892,83 @@ def test_online_stylesheet(): ) tm.assert_frame_equal(df_expected, df_xsl) + + +# COMPRESSION + + +@pytest.mark.parametrize( + "compfile", ["geom_xml.bz2", "geom_xml.gzz", "geom_xml.xz", "geom_xml.zip"] +) +def test_compression_read(datapath, parser, compfile): + filename = datapath("io", "data", "xml", "geom_xml.bz2") + xml_df = read_xml(filename, parser=parser) + + tm.assert_frame_equal(xml_df, geom_df) + + +def test_wrong_compression_bz2(datapath, parser): + filename = datapath("io", "data", "xml", "geom_xml.zip") + + with pytest.raises(OSError, match="Invalid data stream"): + read_xml(filename, parser=parser, compression="bz2") + + +def test_wrong_compression_gz(datapath, parser): + filename = datapath("io", "data", "xml", "geom_xml.zip") + + with pytest.raises(OSError, match="Not a gzipped file"): + read_xml(filename, parser=parser, compression="gzip") + + +def test_wrong_compression_xz(datapath, parser): + from lzma import LZMAError + + filename = datapath("io", "data", "xml", "geom_xml.bz2") + + with pytest.raises(LZMAError, match="Input format not supported by 
decoder"): + read_xml(filename, parser=parser, compression="xz") + + +def test_wrong_compression_zip(datapath, parser): + from zipfile import BadZipFile + + filename = datapath("io", "data", "xml", "geom_xml.gz") + + with pytest.raises(BadZipFile, match="File is not a zip file"): + read_xml(filename, parser=parser, compression="zip") + + +def test_unsuported_compression(datapath, parser): + with pytest.raises(ValueError, match="Unrecognized compression type"): + with tm.ensure_clean() as path: + read_xml(path, compression="7z") + + +# STORAGE OPTIONS + + +@tm.network +@td.skip_if_no("s3fs") +@td.skip_if_no("lxml") +def test_s3_parser_consistency(): + # Python Software Foundation (2019 IRS-990 FORM) + s3 = "s3://irs-form-990/201923199349319487_public.xml" + + df_lxml = read_xml( + s3, + xpath=".//irs:Form990PartVIISectionAGrp", + namespaces={"irs": "http://www.irs.gov/efile"}, + parser="lxml", + storage_options={"anon": True}, + ) + + df_etree = read_xml( + s3, + xpath=".//irs:Form990PartVIISectionAGrp", + namespaces={"irs": "http://www.irs.gov/efile"}, + parser="etree", + storage_options={"anon": True}, + ) + + tm.assert_frame_equal(df_lxml, df_etree) From a7716b8b91d670a0b1c533d20d1795db906474d0 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 20 Feb 2021 20:43:14 -0600 Subject: [PATCH 21/35] Fix issues in tests from other Python envs --- pandas/io/formats/xml.py | 40 ++++++++-- pandas/io/xml.py | 29 +++++-- pandas/tests/io/formats/test_to_xml.py | 69 ++++++++-------- pandas/tests/io/test_xml.py | 105 +++++++++++++++---------- 4 files changed, 158 insertions(+), 85 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 8defe95ac6d8b..a15c0c6ce32ff 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -4,14 +4,28 @@ import codecs import io -from typing import Any, Dict, List, Optional, Union - -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from typing import ( + Any, + Dict, + 
List, + Optional, + Union, +) + +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import is_list_like -from pandas.io.common import get_handle, is_url, urlopen +from pandas.io.common import ( + get_handle, + is_url, + urlopen, +) from pandas.io.formats.format import DataFrameFormatter @@ -284,7 +298,11 @@ def __init__(self, *args, **kwargs): self.prefix_uri = self.get_prefix_uri() def build_tree(self) -> bytes: - from xml.etree.ElementTree import Element, SubElement, tostring + from xml.etree.ElementTree import ( + Element, + SubElement, + tostring, + ) self.root = Element( f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() @@ -424,7 +442,11 @@ def build_tree(self) -> bytes: This method initializes the root and builds attributes and elements with optional namespaces. """ - from lxml.etree import Element, SubElement, tostring + from lxml.etree import ( + Element, + SubElement, + tostring, + ) self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces) @@ -576,7 +598,11 @@ def parse_doc(self): * If io object is not readable as string or file-like object. 
""" - from lxml.etree import XML, XMLParser, parse + from lxml.etree import ( + XML, + XMLParser, + parse, + ) current_doc = self.convert_io() if current_doc and isinstance(current_doc, str): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index c1ee926775e9c..3355783a8aa1d 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -3,11 +3,23 @@ """ import io -from typing import Dict, List, Optional, Union +from typing import ( + Dict, + List, + Optional, + Union, +) -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) from pandas.compat._optional import import_optional_dependency -from pandas.errors import AbstractMethodError, ParserError +from pandas.errors import ( + AbstractMethodError, + ParserError, +) from pandas.core.dtypes.common import is_list_like @@ -255,7 +267,10 @@ class _EtreeFrameParser(_XMLFrameParser): standard library XML module: `xml.etree.ElementTree`. """ - from xml.etree.ElementTree import Element, ElementTree + from xml.etree.ElementTree import ( + Element, + ElementTree, + ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -613,7 +628,11 @@ def _validate_names(self) -> None: ) def _parse_doc(self): - from lxml.etree import XML, XMLParser, parse + from lxml.etree import ( + XML, + XMLParser, + parse, + ) self.raw_doc = self.stylesheet if self.is_style else self.path_or_buffer diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index f5392442c7dad..d978d26429542 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1,4 +1,7 @@ -from io import BytesIO, StringIO +from io import ( + BytesIO, + StringIO, +) import os import sys @@ -15,35 +18,37 @@ """ CHECKLIST -[x] - ValueError("Values for parser can only be lxml or etree.") +[x] - ValueError: "Values for parser can only be lxml or etree." 
etree -[x] - ImportError("lxml not found, please install or use the etree parser.") -[X] - TypeError("...is not a valid type for attr_cols") -[X] - TypeError("...is not a valid type for elem_cols") -[X] - LookupError("unknown encoding") -[X] - KeyError("...is not included in namespaces") -[X] - KeyError("no valid column") -[X] - ValueError("To use stylesheet, you need lxml installed...") -[] - OSError (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) -[X] - FileNotFoundError("No such file or directory") +[x] - ImportError: "lxml not found, please install or use the etree parser." +[X] - TypeError: "...is not a valid type for attr_cols" +[X] - TypeError: "...is not a valid type for elem_cols" +[X] - LookupError: "unknown encoding" +[X] - KeyError: "...is not included in namespaces" +[X] - KeyError: "no valid column" +[X] - ValueError: "To use stylesheet, you need lxml installed..." +[] - OSError: (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) +[X] - FileNotFoundError: "No such file or directory" +[X] - PermissionError: "Forbidden" lxml -[X] - TypeError("...is not a valid type for attr_cols") -[X] - TypeError("...is not a valid type for elem_cols") -[X] - LookupError("unknown encoding") -[] - OSError (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) -[X] - FileNotFoundError("No such file or directory") -[X] - KeyError("...is not included in namespaces") -[X] - KeyError("no valid column") -[X] - ValueError("stylesheet is not a url, file, or xml string.") -[] - LookupError (NEED WRONG ENCODING FOR FILE OUTPUT) -[] - URLError (USUALLY DUE TO NETWORKING) -[] - HTTPError (NEED AN ONLINE STYLESHEET) -[X] - OSError("failed to load external entity") -[X] - XMLSyntaxError("Opening and ending tag mismatch") -[X] - XSLTApplyError("Cannot resolve URI") -[X] - XSLTParseError("failed to compile") +[X] - TypeError: "...is not a valid type for attr_cols" +[X] - TypeError: "...is not a valid type for elem_cols" +[X] - LookupError: "unknown encoding" +[] - OSError: (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) 
+[X] - FileNotFoundError: "No such file or directory" +[X] - KeyError: "...is not included in namespaces" +[X] - KeyError: "no valid column" +[X] - ValueError: "stylesheet is not a url, file, or xml string." +[] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT) +[] - URLError: (USUALLY DUE TO NETWORKING) +[] - HTTPError: (NEED AN ONLINE STYLESHEET) +[X] - OSError: "failed to load external entity" +[X] - XMLSyntaxError: "Opening and ending tag mismatch" +[X] - XSLTApplyError: "Cannot resolve URI" +[X] - XSLTParseError: "failed to compile" +[X] - PermissionError: "Forbidden" """ geom_df = DataFrame( @@ -1306,7 +1311,7 @@ def test_bz2_output(parser): import bz2 with tm.ensure_clean() as path: - geom_df.to_xml(path, compression="bz2") + geom_df.to_xml(path, parser=parser, compression="bz2") with bz2.BZ2File(path, "rb") as fp: output = fp.read() @@ -1324,7 +1329,7 @@ def test_gz_output(parser): import gzip with tm.ensure_clean() as path: - geom_df.to_xml(path, compression="gzip") + geom_df.to_xml(path, parser=parser, compression="gzip") with gzip.open(path, "rb") as fp: output = fp.read() @@ -1342,7 +1347,7 @@ def test_xz_output(parser): import lzma with tm.ensure_clean() as path: - geom_df.to_xml(path, compression="xz") + geom_df.to_xml(path, parser=parser, compression="xz") with lzma.open(path, "rb") as fp: output = fp.read() @@ -1360,7 +1365,7 @@ def test_zip_output(parser): import zipfile with tm.ensure_clean() as path: - geom_df.to_xml(path, compression="zip") + geom_df.to_xml(path, parser=parser, compression="zip") with zipfile.ZipFile(path, "r") as fp: output = fp.read(fp.infolist()[0]) @@ -1377,7 +1382,7 @@ def test_zip_output(parser): def test_unsuported_compression(datapath, parser): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: - geom_df.to_xml(path, compression="7z") + geom_df.to_xml(path, parser=parser, compression="7z") # STORAGE OPTIONS @@ -1393,4 +1398,4 @@ def test_s3_permission_output(parser): 
fs = s3fs.S3FileSystem(anon=True) fs.ls("pandas-test") - geom_df.to_xml("s3://pandas-test/geom.xml", compression="zip") + geom_df.to_xml("s3://pandas-test/geom.xml", compression="zip", parser=parser) diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 6f1f7579616e0..927999efbe6ae 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -1,4 +1,7 @@ -from io import BytesIO, StringIO +from io import ( + BytesIO, + StringIO, +) import os from urllib.error import HTTPError @@ -15,39 +18,53 @@ """ CHECK LIST -[x] - ValueError("Values for parser can only be lxml or etree.") +[x] - ValueError: "Values for parser can only be lxml or etree." etree -[x] - ImportError("lxml not found, please install or use the etree parser.") -[X] - ValueError("Either element or attributes can be parsed not both.") -[X] - ValueError("xpath does not return any nodes...") -[X] - SyntaxError("You have used an incorrect or unsupported XPath") -[X] - ValueError("names does not match length of child elements in xpath.") -[X] - TypeError("...is not a valid type for names") -[X] - ValueError("To use stylesheet, you need lxml installed...") -[] - URLError (GENERAL ERROR WITH HTTPError AS SUBCLASS) -[X] - HTTPError("HTTP Error 404: Not Found") -[] - OSError (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) -[X] - FileNotFoundError("No such file or directory") +[x] - ImportError: "lxml not found, please install or use the etree parser." +[X] - ValueError: "Either element or attributes can be parsed not both." +[X] - ValueError: "xpath does not return any nodes..." +[X] - SyntaxError: "You have used an incorrect or unsupported XPath" +[X] - ValueError: "names does not match length of child elements in xpath." +[X] - TypeError: "...is not a valid type for names" +[X] - ValueError: "To use stylesheet, you need lxml installed..." 
+[] - URLError: (GENERAL ERROR WITH HTTPError AS SUBCLASS) +[X] - HTTPError: "HTTP Error 404: Not Found" +[] - OSError: (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS) +[X] - FileNotFoundError: "No such file or directory" [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) +[X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..." +[X] - UnicodeError: "UTF-16 stream does not start with BOM" +[X] - BadZipFile: "File is not a zip file" +[X] - OSError: "Invalid data stream" +[X] - LZMAError: "Input format not supported by decoder" +[X] - ValueError: "Unrecognized compression type" +[X] - PermissionError: "Forbidden" lxml -[X] - ValueError("Either element or attributes can be parsed not both.") -[X] - XSLTApplyError("Cannot resolve URI") -[X] - XSLTParseError("document is not a stylesheet") -[X] - ValueError("xpath does not return any nodes.") -[X] - XPathEvalError("Invalid expression") -[] - XPathSyntaxError (OLD VERSION IN lxml FOR XPATH ERRORS) -[X] - TypeError("empty namespace prefix is not supported in XPath") -[X] - ValueError("names does not match length of child elements in xpath.") -[X] - TypeError("...is not a valid type for names") -[X] - LookupError(unknown encoding) -[] - URLError (USUALLY DUE TO NETWORKING) -[X - HTTPError("HTTP Error 404: Not Found") -[X] - OSError("failed to load external entity") -[X] - XMLSyntaxError("Start tag expected, '<' not found") -[] - ParserError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML) -[X] - ValueError("Values for parser can only be lxml or etree.") +[X] - ValueError: "Either element or attributes can be parsed not both." +[X] - XSLTApplyError: "Cannot resolve URI" +[X] - XSLTParseError: "document is not a stylesheet" +[X] - ValueError: "xpath does not return any nodes." 
+[X] - XPathEvalError: "Invalid expression" +[] - XPathSyntaxError: (OLD VERSION IN lxml FOR XPATH ERRORS) +[X] - TypeError: "empty namespace prefix is not supported in XPath" +[X] - ValueError: "names does not match length of child elements in xpath." +[X] - TypeError: "...is not a valid type for names" +[X] - LookupError: "unknown encoding" +[] - URLError: (USUALLY DUE TO NETWORKING) +[X - HTTPError: "HTTP Error 404: Not Found" +[X] - OSError: "failed to load external entity" +[X] - XMLSyntaxError: "Start tag expected, '<' not found" +[] - ParserError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML +[X] - ValueError: "Values for parser can only be lxml or etree." +[X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..." +[X] - UnicodeError: "UTF-16 stream does not start with BOM" +[X] - BadZipFile: "File is not a zip file" +[X] - OSError: "Invalid data stream" +[X] - LZMAError: "Input format not supported by decoder" +[X] - ValueError: "Unrecognized compression type" +[X] - PermissionError: "Forbidden" """ geom_df = DataFrame( @@ -536,7 +553,13 @@ def test_wrong_encoding(datapath, parser): def test_utf16_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") - with pytest.raises(UnicodeError, match=("UTF-16 stream does not start with BOM")): + with pytest.raises( + UnicodeError, + match=( + "UTF-16 stream does not start with BOM|" + "'utf-16-le' codec can't decode byte" + ), + ): read_xml(filename, encoding="UTF-16", parser=parser) @@ -897,9 +920,7 @@ def test_online_stylesheet(): # COMPRESSION -@pytest.mark.parametrize( - "compfile", ["geom_xml.bz2", "geom_xml.gzz", "geom_xml.xz", "geom_xml.zip"] -) +@pytest.mark.parametrize("compfile", ["geom_xml.bz2", "geom_xml.gz", "geom_xml.xz"]) def test_compression_read(datapath, parser, compfile): filename = datapath("io", "data", "xml", "geom_xml.bz2") xml_df = read_xml(filename, parser=parser) @@ -908,17 +929,19 @@ def test_compression_read(datapath, parser, compfile): def 
test_wrong_compression_bz2(datapath, parser): - filename = datapath("io", "data", "xml", "geom_xml.zip") + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression="zip") - with pytest.raises(OSError, match="Invalid data stream"): - read_xml(filename, parser=parser, compression="bz2") + with pytest.raises(OSError, match="Invalid data stream"): + read_xml(path, parser=parser, compression="bz2") def test_wrong_compression_gz(datapath, parser): - filename = datapath("io", "data", "xml", "geom_xml.zip") + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression="zip") - with pytest.raises(OSError, match="Not a gzipped file"): - read_xml(filename, parser=parser, compression="gzip") + with pytest.raises(OSError, match="Not a gzipped file"): + read_xml(path, parser=parser, compression="gzip") def test_wrong_compression_xz(datapath, parser): @@ -942,7 +965,7 @@ def test_wrong_compression_zip(datapath, parser): def test_unsuported_compression(datapath, parser): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: - read_xml(path, compression="7z") + read_xml(path, parser=parser, compression="7z") # STORAGE OPTIONS @@ -952,7 +975,7 @@ def test_unsuported_compression(datapath, parser): @td.skip_if_no("s3fs") @td.skip_if_no("lxml") def test_s3_parser_consistency(): - # Python Software Foundation (2019 IRS-990 FORM) + # Python Software Foundation (2019 IRS-990 RETURN) s3 = "s3://irs-form-990/201923199349319487_public.xml" df_lxml = read_xml( From 5b93c1629e98560acf3edd598896596dce862f9d Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 20 Feb 2021 23:42:30 -0600 Subject: [PATCH 22/35] Fix precommit issue with import line --- pandas/io/formats/format.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cf4da7757f75c..b08b67ab5b288 100644 --- a/pandas/io/formats/format.py +++ 
b/pandas/io/formats/format.py @@ -1125,7 +1125,10 @@ def to_xml( values. """ - from pandas.io.formats.xml import EtreeXMLFormatter, LxmlXMLFormatter + from pandas.io.formats.xml import ( + EtreeXMLFormatter, + LxmlXMLFormatter, + ) lxml = import_optional_dependency("lxml.etree", errors="ignore") From 9a0dfb4ff89a762b455a6b4dcfe140ba32e7354e Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 21 Feb 2021 14:41:57 -0600 Subject: [PATCH 23/35] Adjust code and tests per twoertwein comments --- pandas/core/frame.py | 23 ++--- pandas/io/formats/format.py | 19 ++-- pandas/io/formats/xml.py | 136 +++++++++++++------------ pandas/io/xml.py | 89 ++++++---------- pandas/tests/io/data/xml/geom_xml.bz2 | Bin 182 -> 0 bytes pandas/tests/io/data/xml/geom_xml.gz | Bin 166 -> 0 bytes pandas/tests/io/data/xml/geom_xml.xz | Bin 200 -> 0 bytes pandas/tests/io/formats/test_to_xml.py | 83 ++++++--------- pandas/tests/io/test_xml.py | 40 +++++--- 9 files changed, 170 insertions(+), 220 deletions(-) delete mode 100644 pandas/tests/io/data/xml/geom_xml.bz2 delete mode 100644 pandas/tests/io/data/xml/geom_xml.gz delete mode 100644 pandas/tests/io/data/xml/geom_xml.xz diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 99b6cba132f8f..a33e7c5c49879 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2639,9 +2639,10 @@ def to_html( render_links=render_links, ) + @doc(storage_options=generic._shared_docs["storage_options"]) def to_xml( self, - path_or_buffer: Optional[FilePathOrBuffer[str]] = None, + path_or_buffer: Optional[FilePathOrBuffer] = None, index: bool = True, root_name: Optional[str] = "data", row_name: Optional[str] = "row", @@ -2654,7 +2655,7 @@ def to_xml( xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, parser: Optional[str] = "lxml", - stylesheet: Optional[FilePathOrBuffer[str]] = None, + stylesheet: Optional[FilePathOrBuffer] = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> 
Optional[str]: @@ -2691,7 +2692,7 @@ def to_xml( Default namespaces should be given empty string key. For example, :: - namespaces = {"": "https://example.com"} + namespaces = {{"": "https://example.com"}} prefix : str, optional Namespace prefix to be used for every element and/or attribute @@ -2704,7 +2705,7 @@ def to_xml( pretty_print : bool, default True Whether output should be pretty printed with indentation and line breaks. - parser : {'lxml','etree'}, default 'lxml' + parser : {{'lxml','etree'}}, default 'lxml' Parser module to use for building of tree. Only 'lxml' and 'etree' are supported. With 'lxml', the ability to use XSLT stylesheet is supported. @@ -2720,13 +2721,7 @@ def to_xml( '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc., if using a URL that will be - parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be - raised if providing this argument with a non-fsspec URL. See the fsspec - and backend storage implementation docs for the set of allowed keys and - values. + {storage_options} Returns ------- @@ -2741,9 +2736,9 @@ def to_xml( Examples -------- - >>> df = pd.DataFrame({'shape': ['square', 'circle', 'triangle'], + >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], ... 'degrees': [360, 360, 180], - ... 'sides': [4, np.nan, 3]}) + ... 'sides': [4, np.nan, 3]}}) >>> df.to_xml() # doctest: +SKIP @@ -2778,7 +2773,7 @@ def to_xml( - >>> df.to_xml(namespaces={"doc": "https://example.com"}, + >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, ... 
prefix="doc") # doctest: +SKIP diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b08b67ab5b288..44428abdcd8a5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -62,6 +62,7 @@ StorageOptions, ) from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -97,6 +98,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import _shared_docs from pandas.io.common import stringify_path from pandas.io.formats.printing import ( @@ -1035,9 +1037,10 @@ def to_html( string = html_formatter.to_string() return save_to_buffer(string, buf=buf, encoding=encoding) + @doc(storage_options=_shared_docs["storage_options"]) def to_xml( self, - path_or_buffer: Optional[FilePathOrBuffer[str]] = None, + path_or_buffer: Optional[FilePathOrBuffer] = None, index: Optional[bool] = True, root_name: Optional[str] = "data", row_name: Optional[str] = "row", @@ -1050,7 +1053,7 @@ def to_xml( xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, parser: Optional[str] = "lxml", - stylesheet: Optional[FilePathOrBuffer[str]] = None, + stylesheet: Optional[FilePathOrBuffer] = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> Optional[str]: @@ -1087,7 +1090,7 @@ def to_xml( Default namespaces should be given empty string key. For example, :: - namespaces = {'': 'https://example.com'} + namespaces = {{'': 'https://example.com'}} prefix : str, optional Namespace prefix to be used for every element and/or attribute @@ -1100,7 +1103,7 @@ def to_xml( pretty_print : bool, default True Whether output should be pretty printed with indentation and line breaks. 
- parser : {'lxml','etree'}, default "lxml" + parser : {{'lxml','etree'}}, default "lxml" Parser module to use for building of tree. Only 'lxml' and 'etree' are supported. With 'lxml', the ability to use XSLT stylesheet is supported. @@ -1116,13 +1119,7 @@ def to_xml( '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc., if using a URL that will be - parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be - raised if providing this argument with a non-fsspec URL. See the fsspec - and backend storage implementation docs for the set of allowed keys and - values. + {storage_options} """ from pandas.io.formats.xml import ( diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index a15c0c6ce32ff..49c18b344e1e9 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -22,9 +22,11 @@ from pandas.core.dtypes.common import is_list_like from pandas.io.common import ( + file_exists, get_handle, + is_fsspec_url, is_url, - urlopen, + stringify_path, ) from pandas.io.formats.format import DataFrameFormatter @@ -76,7 +78,7 @@ class BaseXMLFormatter: stylesheet : str or file-like A URL, file, file-like object, or a raw string containing XSLT. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' Compression type for on-the-fly decompression of on-disk data. If 'infer', then use extension for gzip, bz2, zip or xz. 
@@ -94,7 +96,7 @@ class BaseXMLFormatter: def __init__( self, formatter: DataFrameFormatter, - path_or_buffer: Optional[FilePathOrBuffer[str]] = None, + path_or_buffer: Optional[FilePathOrBuffer] = None, index: Optional[bool] = True, root_name: Optional[str] = "data", row_name: Optional[str] = "row", @@ -106,7 +108,7 @@ def __init__( encoding: str = "utf-8", xml_declaration: Optional[bool] = True, pretty_print: Optional[bool] = True, - stylesheet: Optional[FilePathOrBuffer[str]] = None, + stylesheet: Optional[FilePathOrBuffer] = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> None: @@ -262,6 +264,56 @@ def build_elems(self) -> None: raise AbstractMethodError(self) + def _preprocess_data(self, data): + """ + Convert extracted raw data. + + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. + """ + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + def _get_data_from_filepath(self, filepath_or_buffer): + """ + Extract raw XML data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. XML bytes + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. 
+ """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): + with get_handle( + filepath_or_buffer, + "r", + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) as handle_obj: + filepath_or_buffer = ( + handle_obj.handle.read() + if hasattr(handle_obj.handle, "read") + else handle_obj.handle + ) + + return filepath_or_buffer + def write_output(self) -> Optional[str]: xml_doc = self.build_tree() @@ -541,61 +593,12 @@ def build_elems(self) -> None: except KeyError: raise KeyError(f"no valid column, {col}") - def convert_io(self) -> Union[bytes, str, None]: - """ - Convert stylesheet object to string. - - This method will convert stylesheet object into a string or keep - as string, depending on object type. - """ - - obj: Union[bytes, str, None] = None - - if isinstance(self.stylesheet, str): - obj = self.stylesheet - - elif isinstance(self.stylesheet, bytes): - obj = self.stylesheet.decode(self.encoding) - - elif isinstance(self.stylesheet, io.StringIO): - obj = self.stylesheet.getvalue() - - elif isinstance(self.stylesheet, io.BytesIO): - obj = self.stylesheet.getvalue().decode(self.encoding) - - elif isinstance(self.stylesheet, io.TextIOWrapper): - obj = self.stylesheet.read() - - elif isinstance(self.stylesheet, io.BufferedReader): - obj = self.stylesheet.read().decode(self.encoding) - else: - obj = None - - return obj - def parse_doc(self): """ Build tree from stylesheet. This method will parse stylesheet object into tree for parsing conditionally by its specific object type. - - Raises - ------ - HttpError - * If URL cannot be reached. - - LookupError - * If xml document has incorrect or unknown encoding. - - OSError - * If file cannot be found. - - XMLSyntaxError - * If xml document conntains syntax issues. 
- - ValueError - * If io object is not readable as string or file-like object. """ from lxml.etree import ( @@ -604,21 +607,24 @@ def parse_doc(self): parse, ) - current_doc = self.convert_io() - if current_doc and isinstance(current_doc, str): - is_xml = current_doc.startswith((" None: def _parse_doc(self) -> Union[Element, ElementTree]: from xml.etree.ElementTree import ( - Element, - ElementTree, XMLParser, - fromstring, parse, ) - if isinstance(self.path_or_buffer, str): - if self.path_or_buffer.startswith((" DataFrame: class that build Data Frame and infers specific dtypes. """ - tags = [list(d.keys()) for d in data] + tags = next(iter(data)) nodes = [list(d.values()) for d in data] try: - with TextParser(nodes, names=tags[0], **kwargs) as tp: + with TextParser(nodes, names=tags, **kwargs) as tp: return tp.read() except ParserError: raise ParserError( @@ -750,6 +724,7 @@ def _parse( return _data_to_frame(data=data_dicts, **kwargs) +@doc(storage_options=_shared_docs["storage_options"]) def read_xml( path_or_buffer: FilePathOrBuffer, xpath: Optional[str] = "./*", @@ -759,7 +734,7 @@ def read_xml( names: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", parser: Optional[str] = "lxml", - stylesheet: Optional[FilePathOrBuffer[str]] = None, + stylesheet: Optional[FilePathOrBuffer] = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> DataFrame: @@ -790,7 +765,7 @@ def read_xml( namespace prefix such as 'doc' to the URI in order to parse underlying nodes and/or attributes. For example, :: - namespaces = {"doc": "https://example.com"} + namespaces = {{"doc": "https://example.com"}} elems_only : bool, optional, default False Parse only the child elements at the specified ``xpath``. By default, @@ -807,7 +782,7 @@ def read_xml( encoding : str, optional, default 'utf-8' Encoding of XML document. 
- parser : {'lxml','etree'}, default 'lxml' + parser : {{'lxml','etree'}}, default 'lxml' Parser module to use for retrieval of data. Only 'lxml' and 'etree' are supported. With 'lxml' more complex XPath searches and ability to use XSLT stylesheet are supported. @@ -828,13 +803,7 @@ def read_xml( otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc., if using a URL that will be - parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be - raised if providing this argument with a non-fsspec URL. See the fsspec - and backend storage implementation docs for the set of allowed keys and - values. + {storage_options} Returns ------- @@ -941,7 +910,7 @@ def read_xml( >>> df = pd.read_xml(xml, ... xpath="//doc:row", - ... namespaces={"doc": "https://example.com"}) + ... namespaces={{"doc": "https://example.com"}}) >>> df shape degrees sides 0 square 360 4.0 diff --git a/pandas/tests/io/data/xml/geom_xml.bz2 b/pandas/tests/io/data/xml/geom_xml.bz2 deleted file mode 100644 index eef4b3597e6eb38b6c0b2c1ba23bcdd27336f60b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 182 zcmV;n07?HsT4*^jL0KkKSxE8zK>z?hS%5$gK!fc^ujl{3FaWs27)(tD2*fflm^38O zkTe=J01X6+B*HX#5Yr)`#6bjv4IxD&#YR?6PYyDd%fU|j91CQMi?`2`n(UxNy-=9E z=HGx&{|Fj kK$@grDs3x?pb?S)V8bt%JtGIW7GI0GBAh5lBzXTIfOhvx$^ZZW diff --git a/pandas/tests/io/data/xml/geom_xml.gz b/pandas/tests/io/data/xml/geom_xml.gz deleted file mode 100644 index 8cd899f91024fdebe12aee22d6b7574c998833c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 166 zcmV;X09pSZiwFn}q%dFt|7T@yZC`k8Yyg#z!4AS85Jd0&3T~V%ZE8#plzz(wRwSvQ z3spZ~D)po_@w79U-IvMA@o}Em4Jo?d2j12VXXte>&f9^{>A36ps48h7fvOmj6z04%^yb^rhX diff --git a/pandas/tests/io/data/xml/geom_xml.xz 
b/pandas/tests/io/data/xml/geom_xml.xz deleted file mode 100644 index 7ef61d2925b05c77b390ca432160a542e8e65bec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 200 zcmV;(05|{rH+ooF000E$*0e?f03iVu0001VFXf})0Ym_ZT>u^r%ZCxz&SsGhgC5HL zdo9`@l$!Nks(G>G~_q!H$g@O_5n)ug+ru2jawQ+!V5g~{~QxIWgJo-WG)vtUM ziz`pe{lmqHf$N|3oaLKM0JX-1QM|LUl8}?}eQ){EXNG??L8FjQ!mN&^unnSyvi1gb z2vzZ0XY>v!&=~lufy!lc@BjekzSlrkJ^XF}0i*%N0ssKdWUoH4#Ao{g000001X)^s Cj$QTu diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index d978d26429542..4fb1d0c2cf638 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -13,6 +13,7 @@ from pandas import DataFrame import pandas._testing as tm +from pandas.io.common import get_handle from pandas.io.xml import read_xml """ @@ -1062,11 +1063,13 @@ def test_stylesheet_buffered_reader(datapath, mode): @td.skip_if_no("lxml") def test_stylesheet_wrong_path(datapath): + from lxml.etree import XMLSyntaxError + xsl = os.path.join("data", "xml", "row_field_output.xslt") with pytest.raises( - (OSError, FileNotFoundError), - match=("failed to load external entity|No such file or directory|没有那个文件或目录"), + (XMLSyntaxError), + match=("Start tag expected, '<' not found"), ): geom_df.to_xml(stylesheet=xsl) @@ -1074,7 +1077,7 @@ def test_stylesheet_wrong_path(datapath): @td.skip_if_no("lxml") def test_stylesheet_not_path_buffer(): with pytest.raises( - ValueError, match=("stylesheet is not a url, file, or xml string") + TypeError, match=("argument of type 'function' is not iterable") ): geom_df.to_xml(stylesheet=DataFrame) @@ -1283,6 +1286,7 @@ def test_style_to_json(): # COMPRESSION + geom_xml = """\ @@ -1307,53 +1311,20 @@ def test_style_to_json(): """ -def test_bz2_output(parser): - import bz2 - - with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression="bz2") - - with bz2.BZ2File(path, "rb") as fp: - output = fp.read() - - # etree 
and lxml differs on quotes and case in xml declaration - output = output.decode("utf-8").replace( - ' Date: Mon, 22 Feb 2021 20:12:24 -0600 Subject: [PATCH 24/35] Remove redundancy and object names in XML parse and rename tests for clarity --- pandas/io/formats/xml.py | 52 +++++++++++---------- pandas/io/xml.py | 62 ++++++++++++-------------- pandas/tests/io/formats/test_to_xml.py | 2 +- 3 files changed, 55 insertions(+), 61 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 49c18b344e1e9..bcb6b41feb31a 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -264,22 +264,6 @@ def build_elems(self) -> None: raise AbstractMethodError(self) - def _preprocess_data(self, data): - """ - Convert extracted raw data. - - This method will return underlying data of extracted XML content. - The data either has a `read` attribute (e.g. a file object or a - StringIO/BytesIO) or is a string or bytes that is an XML document. - """ - if isinstance(data, str): - data = io.StringIO(data) - - elif isinstance(data, bytes): - data = io.BytesIO(data) - - return data - def _get_data_from_filepath(self, filepath_or_buffer): """ Extract raw XML data. @@ -314,6 +298,22 @@ def _get_data_from_filepath(self, filepath_or_buffer): return filepath_or_buffer + def _preprocess_data(self, data): + """ + Convert extracted raw data. + + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. 
+ """ + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + def write_output(self) -> Optional[str]: xml_doc = self.build_tree() @@ -602,8 +602,8 @@ def parse_doc(self): """ from lxml.etree import ( - XML, XMLParser, + fromstring, parse, ) @@ -612,19 +612,17 @@ def parse_doc(self): if isinstance(style_doc, str) and style_doc.startswith((" None: """ raise AbstractMethodError(self) - def _preprocess_data(self, data): - """ - Convert extracted raw data. - - This method will return underlying data of extracted XML content. - The data either has a `read` attribute (e.g. a file object or a - StringIO/BytesIO) or is a string or bytes that is an XML document. - """ - - if isinstance(data, str): - data = io.StringIO(data) - - elif isinstance(data, bytes): - data = io.BytesIO(data) - - return data - def _get_data_from_filepath(self, filepath_or_buffer): """ Extract raw XML data. @@ -237,6 +220,23 @@ def _get_data_from_filepath(self, filepath_or_buffer): return filepath_or_buffer + def _preprocess_data(self, data): + """ + Convert extracted raw data. + + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + def _parse_doc(self): """ Build tree from io. 
@@ -418,11 +418,11 @@ def _parse_doc(self) -> Union[Element, ElementTree]: ): self.path_or_buffer = self.path_or_buffer.encode(self.encoding) - data = self._get_data_from_filepath(self.path_or_buffer) - self.data = self._preprocess_data(data) + handle_data = self._get_data_from_filepath(self.path_or_buffer) + self.xml_data = self._preprocess_data(handle_data) curr_parser = XMLParser(encoding=self.encoding) - r = parse(self.data, parser=curr_parser) + r = parse(self.xml_data, parser=curr_parser) return r @@ -604,8 +604,8 @@ def _validate_names(self) -> None: def _parse_doc(self): from lxml.etree import ( - XML, XMLParser, + fromstring, parse, ) @@ -614,19 +614,17 @@ def _parse_doc(self): if isinstance(raw_doc, str) and raw_doc.startswith((" Date: Mon, 22 Feb 2021 20:27:13 -0600 Subject: [PATCH 25/35] Add XML table in install.rst --- doc/source/getting_started/install.rst | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 943847f6552ef..a9c3d637a41e3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -273,16 +273,6 @@ Computation Dependency Minimum Version Notes ========================= ================== ============================================================= SciPy 1.12.0 Miscellaneous statistical functions -<<<<<<< HEAD -xlsxwriter 1.0.2 Excel writing -blosc 1.17.0 Compression for HDF5 -fsspec 0.7.4 Handling files aside from local and HTTP -fastparquet 0.4.0 Parquet reading / writing -gcsfs 0.6.0 Google Cloud Storage access -html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) -lxml 4.3.0 | HTML parser for read_html (see :ref:`note `) - | XML parser for read_xml and tree builder for to_xml -matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations (see :ref:`Enhancing Performance `) xarray 0.12.3 pandas-like API for N-dimensional data @@ -336,6 
+326,15 @@ top-level :func:`~pandas.read_html` function: .. _lxml: https://lxml.de .. _tabulate: https://github.com/astanin/python-tabulate +XML +^^^ + +========================= ================== ============================================================= +Dependency Minimum Version Notes +========================= ================== ============================================================= +lxml 4.3.0 XML parser for read_xml and tree builder for to_xml +========================= ================== ============================================================= + SQL databases ^^^^^^^^^^^^^ From 9b2163669397c2b06314bfd33f9935936bef17d4 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 23 Feb 2021 12:52:34 -0600 Subject: [PATCH 26/35] Streamline filepath_or_buffer handling and add TypeError tests --- pandas/io/formats/xml.py | 9 ++--- pandas/io/xml.py | 17 ++++----- pandas/tests/io/formats/test_to_xml.py | 4 +-- pandas/tests/io/test_xml.py | 50 +++++++++++++++++++++++--- 4 files changed, 57 insertions(+), 23 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index bcb6b41feb31a..a0f0c1f1a8141 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -277,7 +277,11 @@ def _get_data_from_filepath(self, filepath_or_buffer): It returns input types (2) and (3) unchanged. 
""" filepath_or_buffer = stringify_path(filepath_or_buffer) + if ( + isinstance(filepath_or_buffer, str) + and not filepath_or_buffer.startswith((" None: def _validate_names(self) -> None: if self.names: parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) - if parent: - children = parent.findall("*") + children = parent.findall("*") if parent else [] if is_list_like(self.names): if len(self.names) < len(children): @@ -413,11 +416,6 @@ def _parse_doc(self) -> Union[Element, ElementTree]: parse, ) - if isinstance(self.path_or_buffer, str) and self.path_or_buffer.startswith( - (" Date: Tue, 23 Feb 2021 14:16:26 -0600 Subject: [PATCH 27/35] Fix lxml test on few Python envs --- pandas/io/formats/xml.py | 2 +- pandas/io/xml.py | 2 +- pandas/tests/io/formats/test_to_xml.py | 7 ++++++- pandas/tests/io/test_xml.py | 7 ++++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index a0f0c1f1a8141..1cd9af30763cc 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -271,7 +271,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): The method accepts three input types: 1. filepath (string-like) 2. file-like object (e.g. open file object, StringIO) - 3. XML bytes + 3. XML string or bytes This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 52d01d0213e74..b45b80d715d89 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -193,7 +193,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): The method accepts three input types: 1. filepath (string-like) 2. file-like object (e.g. open file object, StringIO) - 3. XML bytes + 3. XML string or bytes This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. 
diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 6d90b24137155..0428def340b3d 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1076,7 +1076,12 @@ def test_stylesheet_wrong_path(datapath): @td.skip_if_no("lxml") def test_stylesheet_not_path_buffer(): - with pytest.raises(TypeError, match=("cannot parse from 'type'")): + from lxml.etree import XMLSyntaxError + + with pytest.raises( + (TypeError, XMLSyntaxError), + match=("cannot parse from 'type'|Start tag expected, '<' not found"), + ): geom_df.to_xml(stylesheet=DataFrame) diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index d8b21d3f909a7..d04c3a6664943 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -906,9 +906,14 @@ def test_wrong_stylesheet(): @td.skip_if_no("lxml") def test_stylesheet_not_path_buffer(): + from lxml.etree import XMLSyntaxError + kml = os.path.join("data", "xml", "cta_rail_lines.kml") - with pytest.raises(TypeError, match=("cannot parse from 'type'")): + with pytest.raises( + (TypeError, XMLSyntaxError), + match=("cannot parse from 'type'|Start tag expected, '<' not found"), + ): read_xml(kml, stylesheet=DataFrame) From ce986bcb636ceccae7eb936006be610fee67f9e6 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 23 Feb 2021 21:22:50 -0600 Subject: [PATCH 28/35] Adjust io handling in context maanger --- pandas/io/formats/xml.py | 16 +++++------ pandas/io/xml.py | 16 +++++------ pandas/tests/io/formats/test_to_xml.py | 15 +++++----- pandas/tests/io/test_xml.py | 40 ++++++++++++++++---------- 4 files changed, 49 insertions(+), 38 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 1cd9af30763cc..b5108ebc5d176 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -614,16 +614,16 @@ def parse_doc(self): style_doc = self.stylesheet handle_data = self._get_data_from_filepath(style_doc) - xml_data = 
self._preprocess_data(handle_data) - curr_parser = XMLParser(encoding=self.encoding) + with self._preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) - if isinstance(xml_data, io.StringIO): - r = fromstring( - xml_data.getvalue().encode(self.encoding), parser=curr_parser - ) - else: - r = parse(xml_data, parser=curr_parser) + if isinstance(xml_data, io.StringIO): + r = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + r = parse(xml_data, parser=curr_parser) return r diff --git a/pandas/io/xml.py b/pandas/io/xml.py index b45b80d715d89..1a62c9411942e 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -610,16 +610,16 @@ def _parse_doc(self): raw_doc = self.stylesheet if self.is_style else self.path_or_buffer handle_data = self._get_data_from_filepath(raw_doc) - xml_data = self._preprocess_data(handle_data) - curr_parser = XMLParser(encoding=self.encoding) + with self._preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) - if isinstance(xml_data, io.StringIO): - r = fromstring( - xml_data.getvalue().encode(self.encoding), parser=curr_parser - ) - else: - r = parse(xml_data, parser=curr_parser) + if isinstance(xml_data, io.StringIO): + r = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + r = parse(xml_data, parser=curr_parser) return r diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 0428def340b3d..e94dbb0a436a1 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1068,7 +1068,7 @@ def test_stylesheet_wrong_path(datapath): xsl = os.path.join("data", "xml", "row_field_output.xslt") with pytest.raises( - (XMLSyntaxError), + XMLSyntaxError, match=("Start tag expected, '<' not found"), ): geom_df.to_xml(stylesheet=xsl) @@ -1076,15 +1076,16 @@ def test_stylesheet_wrong_path(datapath): @td.skip_if_no("lxml") def 
test_stylesheet_not_path_buffer(): - from lxml.etree import XMLSyntaxError - - with pytest.raises( - (TypeError, XMLSyntaxError), - match=("cannot parse from 'type'|Start tag expected, '<' not found"), - ): + with pytest.raises(AttributeError, match=("__enter__")): geom_df.to_xml(stylesheet=DataFrame) +@td.skip_if_no("lxml") +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_stylesheet(val): + geom_df.to_xml(stylesheet=val) + + @td.skip_if_no("lxml") def test_incorrect_xsl_syntax(): from lxml.etree import XMLSyntaxError diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index d04c3a6664943..b2426826ccc2e 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -45,8 +45,7 @@ lxml [X] - ValueError: "Either element or attributes can be parsed not both." -[X] - TypeError: "cannot parse from 'NoneType'" -[X] - TypeError: "cannot parse from 'type'" +[X] - AttributeError: "__enter__" [X] - XSLTApplyError: "Cannot resolve URI" [X] - XSLTParseError: "document is not a stylesheet" [X] - ValueError: "xpath does not return any nodes." 
@@ -238,6 +237,22 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_lxml(val): + from lxml.etree import XMLSyntaxError + + with pytest.raises(XMLSyntaxError, match="Document is empty"): + read_xml(val, parser="lxml") + + +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_etree(val): + from xml.etree.ElementTree import ParseError + + with pytest.raises(ParseError, match="no element found"): + read_xml(val, parser="etree") + + @td.skip_if_no("lxml") def test_wrong_file_path_lxml(): from lxml.etree import XMLSyntaxError @@ -264,12 +279,12 @@ def test_wrong_file_path_etree(): @td.skip_if_no("lxml") -def test_none_path_buffer_lxml(parser): - with pytest.raises(TypeError, match=("cannot parse from 'NoneType'")): +def test_none_path_buffer_lxml(): + with pytest.raises(AttributeError, match=("__enter__")): read_xml(None, parser="lxml") -def test_none_path_buffer_etree(parser): +def test_none_path_buffer_etree(): with pytest.raises( TypeError, match=("expected str, bytes or os.PathLike object, not NoneType") ): @@ -277,12 +292,12 @@ def test_none_path_buffer_etree(parser): @td.skip_if_no("lxml") -def test_not_path_buffer_lxml(parser): - with pytest.raises(TypeError, match=("cannot parse from 'type'")): +def test_not_path_buffer_lxml(): + with pytest.raises(AttributeError, match=("__enter__")): read_xml(DataFrame, parser="lxml") -def test_not_path_buffer_etree(parser): +def test_not_path_buffer_etree(): with pytest.raises( TypeError, match=("expected str, bytes or os.PathLike object, not type") ): @@ -906,15 +921,10 @@ def test_wrong_stylesheet(): @td.skip_if_no("lxml") def test_stylesheet_not_path_buffer(): - from lxml.etree import XMLSyntaxError - kml = os.path.join("data", "xml", "cta_rail_lines.kml") - with pytest.raises( - (TypeError, XMLSyntaxError), - match=("cannot parse from 'type'|Start tag expected, '<' not 
found"), - ): - read_xml(kml, stylesheet=DataFrame) + with pytest.raises(AttributeError, match=("__enter__")): + read_xml(kml, stylesheet={"a": 1}) def test_stylesheet_with_etree(datapath): From e2f80db6f15fea9e91b41c4681a8c9ba449c8d21 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 23 Feb 2021 23:14:27 -0600 Subject: [PATCH 29/35] Add and fix tests for special filepath_or_buffer values --- pandas/tests/io/formats/test_to_xml.py | 6 --- pandas/tests/io/test_xml.py | 65 +++++++++++++++++--------- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index e94dbb0a436a1..78640a35e578c 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1074,12 +1074,6 @@ def test_stylesheet_wrong_path(datapath): geom_df.to_xml(stylesheet=xsl) -@td.skip_if_no("lxml") -def test_stylesheet_not_path_buffer(): - with pytest.raises(AttributeError, match=("__enter__")): - geom_df.to_xml(stylesheet=DataFrame) - - @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_stylesheet(val): diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index b2426826ccc2e..55879ffe703e7 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -23,7 +23,6 @@ etree [X] - ImportError: "lxml not found, please install or use the etree parser." [X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType" -[X] - TypeError: "expected str, bytes or os.PathLike object, not type" [X] - ValueError: "Either element or attributes can be parsed not both." [X] - ValueError: "xpath does not return any nodes..." 
[X] - SyntaxError: "You have used an incorrect or unsupported XPath" @@ -237,6 +236,28 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) +@td.skip_if_no("lxml") +def test_closed_file_lxml(datapath): + xml = datapath("io", "data", "xml", "baby_names.xml") + + with open(xml, "rb") as f: + f.read() + + with pytest.raises(ValueError, match="I/O operation on closed file"): + read_xml(f, parser="lxml") + + +def test_closed_file_etree(datapath): + xml = datapath("io", "data", "xml", "baby_names.xml") + + with open(xml, "rb") as f: + f.read() + + with pytest.raises(ValueError, match="read of closed file"): + read_xml(f, parser="etree") + + +@td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_lxml(val): from lxml.etree import XMLSyntaxError @@ -279,29 +300,18 @@ def test_wrong_file_path_etree(): @td.skip_if_no("lxml") -def test_none_path_buffer_lxml(): - with pytest.raises(AttributeError, match=("__enter__")): - read_xml(None, parser="lxml") +def test_none_file_path_lxml(): + xml_var = None + with pytest.raises(AttributeError, match="__enter__"): + read_xml(xml_var, parser="lxml") -def test_none_path_buffer_etree(): +def test_none_file_path_etree(): + xml_var = None with pytest.raises( - TypeError, match=("expected str, bytes or os.PathLike object, not NoneType") + TypeError, match="expected str, bytes or os.PathLike object, not NoneType" ): - read_xml(None, parser="etree") - - -@td.skip_if_no("lxml") -def test_not_path_buffer_lxml(): - with pytest.raises(AttributeError, match=("__enter__")): - read_xml(DataFrame, parser="lxml") - - -def test_not_path_buffer_etree(): - with pytest.raises( - TypeError, match=("expected str, bytes or os.PathLike object, not type") - ): - read_xml(DataFrame, parser="etree") + read_xml(xml_var, parser="etree") @tm.network @@ -921,9 +931,14 @@ def test_wrong_stylesheet(): @td.skip_if_no("lxml") def test_stylesheet_not_path_buffer(): + from 
lxml.etree import XMLSyntaxError + kml = os.path.join("data", "xml", "cta_rail_lines.kml") - with pytest.raises(AttributeError, match=("__enter__")): + with pytest.raises( + (AttributeError, XMLSyntaxError), + match=("__enter__|Start tag expected, '<' not found"), + ): read_xml(kml, stylesheet={"a": 1}) @@ -937,6 +952,14 @@ def test_stylesheet_with_etree(datapath): read_xml(kml, parser="etree", stylesheet=xsl) +@td.skip_if_no("lxml") +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_stylesheet(val): + kml = os.path.join("data", "xml", "cta_rail_lines.kml") + + read_xml(kml, parser="etree", stylesheet=val) + + @tm.network @td.skip_if_no("lxml") def test_online_stylesheet(): From c7e1e118a85090516336f2b0222e074881019e1c Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 24 Feb 2021 07:30:50 -0600 Subject: [PATCH 30/35] Fix tests for better example and wrong parser --- pandas/tests/io/test_xml.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 55879ffe703e7..6247223e99ad0 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -301,17 +301,21 @@ def test_wrong_file_path_etree(): @td.skip_if_no("lxml") def test_none_file_path_lxml(): - xml_var = None - with pytest.raises(AttributeError, match="__enter__"): - read_xml(xml_var, parser="lxml") + with tm.ensure_clean("test.xml") as path: + xml_var = geom_df.to_xml(path) + + with pytest.raises(AttributeError, match="__enter__"): + read_xml(xml_var, parser="lxml") def test_none_file_path_etree(): - xml_var = None - with pytest.raises( - TypeError, match="expected str, bytes or os.PathLike object, not NoneType" - ): - read_xml(xml_var, parser="etree") + with tm.ensure_clean("test.xml") as path: + xml_var = geom_df.to_xml(path) + + with pytest.raises( + TypeError, match="expected str, bytes or os.PathLike object, not NoneType" + ): + read_xml(xml_var, parser="etree") @tm.network @@ 
-957,7 +961,7 @@ def test_stylesheet_with_etree(datapath): def test_empty_stylesheet(val): kml = os.path.join("data", "xml", "cta_rail_lines.kml") - read_xml(kml, parser="etree", stylesheet=val) + read_xml(kml, stylesheet=val) @tm.network From df9ecf4a7d892c375021e11f708f1ca2ea2a2a51 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 24 Feb 2021 09:09:37 -0600 Subject: [PATCH 31/35] Adjust to handle empty string stylesheet with tests --- pandas/io/formats/xml.py | 4 ++-- pandas/io/xml.py | 4 ++-- pandas/tests/io/formats/test_to_xml.py | 7 ++++++- pandas/tests/io/test_xml.py | 11 ++++++++--- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index b5108ebc5d176..d0037dc6cd703 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -384,7 +384,7 @@ def build_tree(self) -> bytes: if not self.xml_declaration: self.out_xml = self.remove_declaration() - if self.stylesheet: + if self.stylesheet is not None: raise ValueError( "To use stylesheet, you need lxml installed and selected as parser." ) @@ -526,7 +526,7 @@ def build_tree(self) -> bytes: xml_declaration=self.xml_declaration, ) - if self.stylesheet: + if self.stylesheet is not None: self.out_xml = self.transform_doc() return self.out_xml diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 1a62c9411942e..311bf39e3ebad 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -268,7 +268,7 @@ def __init__(self, *args, **kwargs): def parse_data(self) -> List[Dict[str, Optional[str]]]: - if self.stylesheet: + if self.stylesheet is not None: raise ValueError( "To use stylesheet, you need lxml installed and selected as parser." 
) @@ -446,7 +446,7 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]: self.xml_doc = self._parse_doc() - if self.stylesheet: + if self.stylesheet is not None: self.is_style = True self.xsl_doc = self._parse_doc() self.xml_doc = self._transform_doc() diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/formats/test_to_xml.py index 78640a35e578c..3b915a9664210 100644 --- a/pandas/tests/io/formats/test_to_xml.py +++ b/pandas/tests/io/formats/test_to_xml.py @@ -1077,7 +1077,12 @@ def test_stylesheet_wrong_path(datapath): @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_stylesheet(val): - geom_df.to_xml(stylesheet=val) + from lxml.etree import XMLSyntaxError + + with pytest.raises( + XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") + ): + geom_df.to_xml(stylesheet=val) @td.skip_if_no("lxml") diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/test_xml.py index 6247223e99ad0..33954d0951cde 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/test_xml.py @@ -302,7 +302,7 @@ def test_wrong_file_path_etree(): @td.skip_if_no("lxml") def test_none_file_path_lxml(): with tm.ensure_clean("test.xml") as path: - xml_var = geom_df.to_xml(path) + xml_var = geom_df.to_xml(path, parser="lxml") with pytest.raises(AttributeError, match="__enter__"): read_xml(xml_var, parser="lxml") @@ -310,7 +310,7 @@ def test_none_file_path_lxml(): def test_none_file_path_etree(): with tm.ensure_clean("test.xml") as path: - xml_var = geom_df.to_xml(path) + xml_var = geom_df.to_xml(path, parser="etree") with pytest.raises( TypeError, match="expected str, bytes or os.PathLike object, not NoneType" @@ -959,9 +959,14 @@ def test_stylesheet_with_etree(datapath): @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_stylesheet(val): + from lxml.etree import XMLSyntaxError + kml = os.path.join("data", "xml", "cta_rail_lines.kml") - read_xml(kml, stylesheet=val) + with 
pytest.raises( + XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") + ): + read_xml(kml, stylesheet=val) @tm.network From 5d75d51f01ff15389e78516819d7803482ca3df4 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 25 Feb 2021 00:47:02 -0600 Subject: [PATCH 32/35] Move methods out of class, adjust xpath check, and data frame formatting --- pandas/io/formats/xml.py | 170 ++++++----- pandas/io/xml.py | 163 +++++----- .../tests/io/{formats => xml}/test_to_xml.py | 0 pandas/tests/io/{ => xml}/test_xml.py | 283 ++++++++---------- 4 files changed, 322 insertions(+), 294 deletions(-) rename pandas/tests/io/{formats => xml}/test_to_xml.py (100%) rename pandas/tests/io/{ => xml}/test_xml.py (95%) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index d0037dc6cd703..fd03dcd342089 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -13,6 +13,7 @@ ) from pandas._typing import ( + Buffer, CompressionOptions, FilePathOrBuffer, StorageOptions, @@ -182,16 +183,16 @@ def process_dataframe(self) -> Dict[Union[int, str], Dict[str, Any]]: Adjust Data Frame to fit xml output. This method will adjust underlying data frame for xml output, - including replacing missing entities and including indexes. + including optionally replacing missing values and including indexes. """ - na_dict = {"None": self.na_rep, "NaN": self.na_rep, "nan": self.na_rep} + df = self.fmt.frame - df = ( - (self.fmt.frame.reset_index().applymap(str).replace(na_dict)) - if self.index - else self.fmt.frame.applymap(str).replace(na_dict) - ) + if self.index: + df = df.reset_index() + + if self.na_rep: + df = df.replace({None: self.na_rep, float("nan"): self.na_rep}) return df.to_dict(orient="index") @@ -264,67 +265,12 @@ def build_elems(self) -> None: raise AbstractMethodError(self) - def _get_data_from_filepath(self, filepath_or_buffer): - """ - Extract raw XML data. - - The method accepts three input types: - 1. filepath (string-like) - 2. 
file-like object (e.g. open file object, StringIO) - 3. XML string or bytes - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. - """ - filepath_or_buffer = stringify_path(filepath_or_buffer) - - if ( - isinstance(filepath_or_buffer, str) - and not filepath_or_buffer.startswith((" Optional[str]: xml_doc = self.build_tree() out_str: Optional[str] if self.path_or_buffer is not None: - # apply compression and byte/text conversion with get_handle( self.path_or_buffer, "wb", @@ -424,8 +370,13 @@ def build_attribs(self) -> None: attr_name = f"{self.prefix_uri}{flat_col}" try: - if self.d[col] is not None: - self.elem_row.attrib[attr_name] = str(self.d[col]) + val = ( + None + if self.d[col] is None or self.d[col] != self.d[col] + else str(self.d[col]) + ) + if val is not None: + self.elem_row.attrib[attr_name] = val except KeyError: raise KeyError(f"no valid column, {col}") @@ -446,7 +397,11 @@ def build_elems(self) -> None: elem_name = f"{self.prefix_uri}{flat_col}" try: - val = None if self.d[col] in [None, ""] else str(self.d[col]) + val = ( + None + if self.d[col] in [None, ""] or self.d[col] != self.d[col] + else str(self.d[col]) + ) SubElement(self.elem_row, elem_name).text = val except KeyError: raise KeyError(f"no valid column, {col}") @@ -570,8 +525,13 @@ def build_attribs(self) -> None: attr_name = f"{self.prefix_uri}{flat_col}" try: - if self.d[col] is not None: - self.elem_row.attrib[attr_name] = self.d[col] + val = ( + None + if self.d[col] is None or self.d[col] != self.d[col] + else str(self.d[col]) + ) + if val is not None: + self.elem_row.attrib[attr_name] = val except KeyError: raise KeyError(f"no valid column, {col}") @@ -592,7 +552,11 @@ def build_elems(self) -> None: elem_name = f"{self.prefix_uri}{flat_col}" try: - val = None if self.d[col] in [None, ""] else str(self.d[col]) + val = ( + None + if self.d[col] in [None, ""] or self.d[col] != self.d[col] + else str(self.d[col]) + ) 
SubElement(self.elem_row, elem_name).text = val except KeyError: raise KeyError(f"no valid column, {col}") @@ -613,9 +577,14 @@ def parse_doc(self): style_doc = self.stylesheet - handle_data = self._get_data_from_filepath(style_doc) + handle_data = _get_data_from_filepath( + filepath_or_buffer=style_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) - with self._preprocess_data(handle_data) as xml_data: + with _preprocess_data(handle_data) as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): @@ -642,3 +611,64 @@ def transform_doc(self) -> bytes: new_doc = transformer(self.root) return bytes(new_doc) + + +def _get_data_from_filepath( + filepath_or_buffer, + encoding, + compression, + storage_options, +) -> Union[str, bytes, Buffer]: + """ + Extract raw XML data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. XML string or bytes + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + isinstance(filepath_or_buffer, str) + and not filepath_or_buffer.startswith((" Union[io.StringIO, io.BytesIO]: + """ + Convert extracted raw data. + + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. 
+ """ + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 311bf39e3ebad..122057c2c625d 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -11,6 +11,7 @@ ) from pandas._typing import ( + Buffer, CompressionOptions, FilePathOrBuffer, StorageOptions, @@ -186,61 +187,6 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) - def _get_data_from_filepath(self, filepath_or_buffer): - """ - Extract raw XML data. - - The method accepts three input types: - 1. filepath (string-like) - 2. file-like object (e.g. open file object, StringIO) - 3. XML string or bytes - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. - """ - filepath_or_buffer = stringify_path(filepath_or_buffer) - - if ( - isinstance(filepath_or_buffer, str) - and not filepath_or_buffer.startswith((" Union[Element, ElementTree]: parse, ) - handle_data = self._get_data_from_filepath(self.path_or_buffer) - self.xml_data = self._preprocess_data(handle_data) + handle_data = _get_data_from_filepath( + filepath_or_buffer=self.path_or_buffer, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) - curr_parser = XMLParser(encoding=self.encoding) - r = parse(self.xml_data, parser=curr_parser) + with _preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + r = parse(xml_data, parser=curr_parser) return r @@ -558,20 +509,23 @@ def _transform_doc(self): def _validate_path(self) -> None: + msg = ( + "xpath does not return any nodes. " + "Be sure row level nodes are in xpath. " + "If document uses namespaces denoted with " + "xmlns, be sure to define namespaces and " + "use them in xpath." 
+ ) + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) - if (elems == [] and attrs == [] and children == []) or ( - elems != [] and attrs == [] and children == [] - ): - raise ValueError( - "xpath does not return any nodes. " - "Be sure row level nodes are in xpath. " - "If document uses namespaces denoted with " - "xmlns, be sure to define namespaces and " - "use them in xpath." - ) + if elems == []: + raise ValueError(msg) + + if elems != [] and attrs == [] and children == []: + raise ValueError(msg) def _validate_names(self) -> None: """ @@ -609,9 +563,14 @@ def _parse_doc(self): raw_doc = self.stylesheet if self.is_style else self.path_or_buffer - handle_data = self._get_data_from_filepath(raw_doc) + handle_data = _get_data_from_filepath( + filepath_or_buffer=raw_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) - with self._preprocess_data(handle_data) as xml_data: + with _preprocess_data(handle_data) as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): @@ -624,6 +583,68 @@ def _parse_doc(self): return r +def _get_data_from_filepath( + filepath_or_buffer, + encoding, + compression, + storage_options, +) -> Union[str, bytes, Buffer]: + """ + Extract raw XML data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. XML string or bytes + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + isinstance(filepath_or_buffer, str) + and not filepath_or_buffer.startswith((" Union[io.StringIO, io.BytesIO]: + """ + Convert extracted raw data. 
+ + This method will return underlying data of extracted XML content. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is a string or bytes that is an XML document. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + def _data_to_frame(data, **kwargs) -> DataFrame: """ Convert parsed data to Data Frame. diff --git a/pandas/tests/io/formats/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py similarity index 100% rename from pandas/tests/io/formats/test_to_xml.py rename to pandas/tests/io/xml/test_to_xml.py diff --git a/pandas/tests/io/test_xml.py b/pandas/tests/io/xml/test_xml.py similarity index 95% rename from pandas/tests/io/test_xml.py rename to pandas/tests/io/xml/test_xml.py index 33954d0951cde..4cf5618e852e5 100644 --- a/pandas/tests/io/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -118,6 +118,115 @@ """ +df_kml = DataFrame( + { + "id": { + 0: "ID_00001", + 1: "ID_00002", + 2: "ID_00003", + 3: "ID_00004", + 4: "ID_00005", + }, + "name": { + 0: "Blue Line (Forest Park)", + 1: "Red, Purple Line", + 2: "Red, Purple Line", + 3: "Red, Purple Line", + 4: "Red, Purple Line", + }, + "styleUrl": { + 0: "#LineStyle01", + 1: "#LineStyle01", + 2: "#LineStyle01", + 3: "#LineStyle01", + 4: "#LineStyle01", + }, + "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}, + "altitudeMode": { + 0: "clampedToGround", + 1: "clampedToGround", + 2: "clampedToGround", + 3: "clampedToGround", + 4: "clampedToGround", + }, + "coordinates": { + 0: ( + "-87.77678526964958,41.8708863930319,0 " + "-87.77826234150609,41.87097820122218,0 " + "-87.78251583439344,41.87130129991005,0 " + "-87.78418294588424,41.87145055520308,0 " + "-87.7872369165933,41.8717239119163,0 " + "-87.79160214925886,41.87210797280065,0" + ), + 1: ( + "-87.65758750947528,41.96427269188822,0 " + "-87.65802133507393,41.96581929055245,0 " + "-87.65819033925305,41.96621846093642,0 " + 
"-87.6583189819129,41.96650362897086,0 " + "-87.65835858701473,41.96669002089185,0 " + "-87.65838428411853,41.96688150295095,0 " + "-87.65842208882658,41.96745896091846,0 " + "-87.65846556843937,41.9683761425439,0 " + "-87.65849296214573,41.96913893870342,0" + ), + 2: ( + "-87.65492939166126,41.95377494531437,0 " + "-87.65557043199591,41.95376544118533,0 " + "-87.65606302030132,41.95376391658746,0 " + "-87.65623502146268,41.95377379126367,0 " + "-87.65634748981634,41.95380103566435,0 " + "-87.65646537904269,41.95387703994676,0 " + "-87.65656532461145,41.95396622645799,0 " + "-87.65664760856414,41.95404201996044,0 " + "-87.65671750555913,41.95416647054043,0 " + "-87.65673983607117,41.95429949810849,0 " + "-87.65673866475777,41.95441024240925,0 " + "-87.6567690255541,41.95490657227902,0 " + "-87.65683672482363,41.95692259283837,0 " + "-87.6568900886376,41.95861070983142,0 " + "-87.65699865558875,41.96181418669004,0 " + "-87.65756347177603,41.96397045777844,0 " + "-87.65758750947528,41.96427269188822,0" + ), + 3: ( + "-87.65362593118043,41.94742799535678,0 " + "-87.65363554415794,41.94819886386848,0 " + "-87.6536456393239,41.95059994675451,0 " + "-87.65365831235026,41.95108288489359,0 " + "-87.6536604873874,41.9519954657554,0 " + "-87.65362592053201,41.95245597302328,0 " + "-87.65367158496069,41.95311153649393,0 " + "-87.65368468595476,41.9533202828916,0 " + "-87.65369271253692,41.95343095587119,0 " + "-87.65373335834569,41.95351536301472,0 " + "-87.65378605844126,41.95358212680591,0 " + "-87.65385067928185,41.95364452823767,0 " + "-87.6539390793817,41.95370263886964,0 " + "-87.6540786298351,41.95373403675265,0 " + "-87.65430648647626,41.9537535411832,0 " + "-87.65492939166126,41.95377494531437,0" + ), + 4: ( + "-87.65345391792157,41.94217681262115,0 " + "-87.65342448305786,41.94237224420864,0 " + "-87.65339745703922,41.94268217746244,0 " + "-87.65337753982941,41.94288140770284,0 " + "-87.65336256753105,41.94317369618263,0 " + "-87.65338799707138,41.94357253961736,0 " 
+ "-87.65340240886648,41.94389158188269,0 " + "-87.65341837392448,41.94406444407721,0 " + "-87.65342275247338,41.94421065714904,0 " + "-87.65347469646018,41.94434829382345,0 " + "-87.65351486483024,41.94447699917548,0 " + "-87.65353483605053,41.9453896864472,0 " + "-87.65361975532807,41.94689193720703,0 " + "-87.65362593118043,41.94742799535678,0" + ), + }, + } +) + + @pytest.fixture(params=["rb", "r"]) def mode(request): return request.param @@ -236,25 +345,13 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) -@td.skip_if_no("lxml") -def test_closed_file_lxml(datapath): - xml = datapath("io", "data", "xml", "baby_names.xml") - - with open(xml, "rb") as f: - f.read() - - with pytest.raises(ValueError, match="I/O operation on closed file"): - read_xml(f, parser="lxml") - +def test_file_handle_close(datapath, parser): + xml_file = datapath("io", "data", "xml", "books.xml") -def test_closed_file_etree(datapath): - xml = datapath("io", "data", "xml", "baby_names.xml") + with open(xml_file, "rb") as f: + read_xml(f.read(), parser=parser) - with open(xml, "rb") as f: - f.read() - - with pytest.raises(ValueError, match="read of closed file"): - read_xml(f, parser="etree") + assert not f.closed @td.skip_if_no("lxml") @@ -299,25 +396,6 @@ def test_wrong_file_path_etree(): read_xml(filename, parser="etree") -@td.skip_if_no("lxml") -def test_none_file_path_lxml(): - with tm.ensure_clean("test.xml") as path: - xml_var = geom_df.to_xml(path, parser="lxml") - - with pytest.raises(AttributeError, match="__enter__"): - read_xml(xml_var, parser="lxml") - - -def test_none_file_path_etree(): - with tm.ensure_clean("test.xml") as path: - xml_var = geom_df.to_xml(path, parser="etree") - - with pytest.raises( - TypeError, match="expected str, bytes or os.PathLike object, not NoneType" - ): - read_xml(xml_var, parser="etree") - - @tm.network @td.skip_if_no("lxml") def test_url(): @@ -675,113 +753,6 @@ def 
test_stylesheet_file(datapath): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - data = { - "id": { - 0: "ID_00001", - 1: "ID_00002", - 2: "ID_00003", - 3: "ID_00004", - 4: "ID_00005", - }, - "name": { - 0: "Blue Line (Forest Park)", - 1: "Red, Purple Line", - 2: "Red, Purple Line", - 3: "Red, Purple Line", - 4: "Red, Purple Line", - }, - "styleUrl": { - 0: "#LineStyle01", - 1: "#LineStyle01", - 2: "#LineStyle01", - 3: "#LineStyle01", - 4: "#LineStyle01", - }, - "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}, - "altitudeMode": { - 0: "clampedToGround", - 1: "clampedToGround", - 2: "clampedToGround", - 3: "clampedToGround", - 4: "clampedToGround", - }, - "coordinates": { - 0: ( - "-87.77678526964958,41.8708863930319,0 " - "-87.77826234150609,41.87097820122218,0 " - "-87.78251583439344,41.87130129991005,0 " - "-87.78418294588424,41.87145055520308,0 " - "-87.7872369165933,41.8717239119163,0 " - "-87.79160214925886,41.87210797280065,0" - ), - 1: ( - "-87.65758750947528,41.96427269188822,0 " - "-87.65802133507393,41.96581929055245,0 " - "-87.65819033925305,41.96621846093642,0 " - "-87.6583189819129,41.96650362897086,0 " - "-87.65835858701473,41.96669002089185,0 " - "-87.65838428411853,41.96688150295095,0 " - "-87.65842208882658,41.96745896091846,0 " - "-87.65846556843937,41.9683761425439,0 " - "-87.65849296214573,41.96913893870342,0" - ), - 2: ( - "-87.65492939166126,41.95377494531437,0 " - "-87.65557043199591,41.95376544118533,0 " - "-87.65606302030132,41.95376391658746,0 " - "-87.65623502146268,41.95377379126367,0 " - "-87.65634748981634,41.95380103566435,0 " - "-87.65646537904269,41.95387703994676,0 " - "-87.65656532461145,41.95396622645799,0 " - "-87.65664760856414,41.95404201996044,0 " - "-87.65671750555913,41.95416647054043,0 " - "-87.65673983607117,41.95429949810849,0 " - "-87.65673866475777,41.95441024240925,0 " - "-87.6567690255541,41.95490657227902,0 " - "-87.65683672482363,41.95692259283837,0 " - 
"-87.6568900886376,41.95861070983142,0 " - "-87.65699865558875,41.96181418669004,0 " - "-87.65756347177603,41.96397045777844,0 " - "-87.65758750947528,41.96427269188822,0" - ), - 3: ( - "-87.65362593118043,41.94742799535678,0 " - "-87.65363554415794,41.94819886386848,0 " - "-87.6536456393239,41.95059994675451,0 " - "-87.65365831235026,41.95108288489359,0 " - "-87.6536604873874,41.9519954657554,0 " - "-87.65362592053201,41.95245597302328,0 " - "-87.65367158496069,41.95311153649393,0 " - "-87.65368468595476,41.9533202828916,0 " - "-87.65369271253692,41.95343095587119,0 " - "-87.65373335834569,41.95351536301472,0 " - "-87.65378605844126,41.95358212680591,0 " - "-87.65385067928185,41.95364452823767,0 " - "-87.6539390793817,41.95370263886964,0 " - "-87.6540786298351,41.95373403675265,0 " - "-87.65430648647626,41.9537535411832,0 " - "-87.65492939166126,41.95377494531437,0" - ), - 4: ( - "-87.65345391792157,41.94217681262115,0 " - "-87.65342448305786,41.94237224420864,0 " - "-87.65339745703922,41.94268217746244,0 " - "-87.65337753982941,41.94288140770284,0 " - "-87.65336256753105,41.94317369618263,0 " - "-87.65338799707138,41.94357253961736,0 " - "-87.65340240886648,41.94389158188269,0 " - "-87.65341837392448,41.94406444407721,0 " - "-87.65342275247338,41.94421065714904,0 " - "-87.65347469646018,41.94434829382345,0 " - "-87.65351486483024,41.94447699917548,0 " - "-87.65353483605053,41.9453896864472,0 " - "-87.65361975532807,41.94689193720703,0 " - "-87.65362593118043,41.94742799535678,0" - ), - }, - } - - df_expected = DataFrame(data) df_style = read_xml( kml, xpath=".//k:Placemark", @@ -789,7 +760,7 @@ def test_stylesheet_file(datapath): stylesheet=xsl, ) - tm.assert_frame_equal(df_expected, df_style) + tm.assert_frame_equal(df_kml, df_style) @td.skip_if_no("lxml") @@ -821,7 +792,14 @@ def test_stylesheet_buffered_reader(datapath, mode): with open(xsl, mode) as f: xsl_obj = f.read() - read_xml(kml, stylesheet=xsl_obj) + df_style = read_xml( + kml, + 
xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=xsl_obj, + ) + + tm.assert_frame_equal(df_kml, df_style) @td.skip_if_no("lxml") @@ -934,18 +912,17 @@ def test_wrong_stylesheet(): @td.skip_if_no("lxml") -def test_stylesheet_not_path_buffer(): - from lxml.etree import XMLSyntaxError +def test_stylesheet_file_close(datapath): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - kml = os.path.join("data", "xml", "cta_rail_lines.kml") + with open(xsl, "rb") as f: + read_xml(kml, stylesheet=f.read()) - with pytest.raises( - (AttributeError, XMLSyntaxError), - match=("__enter__|Start tag expected, '<' not found"), - ): - read_xml(kml, stylesheet={"a": 1}) + assert not f.closed +@td.skip_if_no("lxml") def test_stylesheet_with_etree(datapath): kml = os.path.join("data", "xml", "cta_rail_lines.kml") xsl = os.path.join("data", "xml", "flatten_doc.xsl") From 5c0af6ec1206e10393b096d05b01b1629fc04684 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 25 Feb 2021 15:12:33 -0600 Subject: [PATCH 33/35] Update tests to conform to mypy --- pandas/tests/io/xml/test_to_xml.py | 204 +++++++++-------------------- pandas/tests/io/xml/test_xml.py | 46 +++++-- 2 files changed, 99 insertions(+), 151 deletions(-) diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 3b915a9664210..2026035a23370 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -4,6 +4,7 @@ ) import os import sys +from typing import Union import numpy as np import pytest @@ -135,6 +136,17 @@ """ +def equalize_decl(doc): + # etree and lxml differ on quotes and case in xml declaration + if doc is not None: + doc = doc.replace( + '""" output = geom_df.to_xml(na_rep="0.0", parser=parser) - - # etree and lxml differs on quotes and case in xml declaration - output = output.replace( - '""" output = geom_df.to_xml(attr_cols=["shape", "degrees", 
"sides"], parser=parser) - - # etree and lxml differs on quotes and case in xml declaration - output = output.replace( - '""" output = geom_df.to_xml(namespaces={"": "http://example.com"}, parser=parser) - - # etree and lxml differs on quotes and case in xml declaration - output = output.replace( - '" ) - output = geom_df.to_xml(pretty_print=False) + output = geom_df.to_xml(pretty_print=False, parser="lxml") + output = equalize_decl(output) - output = output.replace( - '", "/>") + # etree adds space for closed tags + if output is not None: + output = output.replace(" />", "/>") assert output == expected @@ -1037,14 +959,17 @@ def test_stylesheet_file_like(datapath, mode): @td.skip_if_no("lxml") def test_stylesheet_io(datapath, mode): - xsl = datapath("io", "data", "xml", "row_field_output.xsl") + xsl_path = datapath("io", "data", "xml", "row_field_output.xsl") - with open(xsl, mode) as f: - xsl_obj = f.read() + xsl_obj: Union[BytesIO, StringIO] - xsl_io = BytesIO(xsl_obj) if isinstance(xsl_obj, bytes) else StringIO(xsl_obj) + with open(xsl_path, mode) as f: + if mode == "rb": + xsl_obj = BytesIO(f.read()) + else: + xsl_obj = StringIO(f.read()) - output = geom_df.to_xml(stylesheet=xsl_io) + output = geom_df.to_xml(stylesheet=xsl_obj) assert output == xsl_expected @@ -1202,7 +1127,10 @@ def test_style_to_csv(): """ - out_csv = geom_df.to_csv(line_terminator="\n").strip() + out_csv = geom_df.to_csv(line_terminator="\n") + + if out_csv is not None: + out_csv = out_csv.strip() out_xml = geom_df.to_xml(stylesheet=xsl) assert out_csv == out_xml @@ -1326,11 +1254,7 @@ def test_compression_output(parser, comp): ) as handle_obj: output = handle_obj.handle.read() - # etree and lxml differs on quotes and case in xml declaration - output = output.replace( - ' Date: Sat, 27 Feb 2021 07:46:50 -0600 Subject: [PATCH 34/35] Import methods to avoid duplication and add typing to parse_doc --- pandas/io/formats/xml.py | 85 ++++++---------------------------------- pandas/io/xml.py | 19 
+++++---- 2 files changed, 25 insertions(+), 79 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index fd03dcd342089..11a9a2a54e717 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -13,7 +13,6 @@ ) from pandas._typing import ( - Buffer, CompressionOptions, FilePathOrBuffer, StorageOptions, @@ -22,14 +21,12 @@ from pandas.core.dtypes.common import is_list_like -from pandas.io.common import ( - file_exists, - get_handle, - is_fsspec_url, - is_url, - stringify_path, -) +from pandas.io.common import get_handle from pandas.io.formats.format import DataFrameFormatter +from pandas.io.xml import ( + get_data_from_filepath, + preprocess_data, +) class BaseXMLFormatter: @@ -436,6 +433,11 @@ class LxmlXMLFormatter(BaseXMLFormatter): modules: `xml.etree.ElementTree` and `xml.dom.minidom`. """ + from lxml.etree import ( + Element, + ElementTree, + ) + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -561,7 +563,7 @@ def build_elems(self) -> None: except KeyError: raise KeyError(f"no valid column, {col}") - def parse_doc(self): + def parse_doc(self) -> Union[Element, ElementTree]: """ Build tree from stylesheet. @@ -577,14 +579,14 @@ def parse_doc(self): style_doc = self.stylesheet - handle_data = _get_data_from_filepath( + handle_data = get_data_from_filepath( filepath_or_buffer=style_doc, encoding=self.encoding, compression=self.compression, storage_options=self.storage_options, ) - with _preprocess_data(handle_data) as xml_data: + with preprocess_data(handle_data) as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): @@ -611,64 +613,3 @@ def transform_doc(self) -> bytes: new_doc = transformer(self.root) return bytes(new_doc) - - -def _get_data_from_filepath( - filepath_or_buffer, - encoding, - compression, - storage_options, -) -> Union[str, bytes, Buffer]: - """ - Extract raw XML data. - - The method accepts three input types: - 1. 
filepath (string-like) - 2. file-like object (e.g. open file object, StringIO) - 3. XML string or bytes - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. - """ - filepath_or_buffer = stringify_path(filepath_or_buffer) - - if ( - isinstance(filepath_or_buffer, str) - and not filepath_or_buffer.startswith((" Union[io.StringIO, io.BytesIO]: - """ - Convert extracted raw data. - - This method will return underlying data of extracted XML content. - The data either has a `read` attribute (e.g. a file object or a - StringIO/BytesIO) or is a string or bytes that is an XML document. - """ - if isinstance(data, str): - data = io.StringIO(data) - - elif isinstance(data, bytes): - data = io.BytesIO(data) - - return data diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 122057c2c625d..a797b30e17ba7 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -362,14 +362,14 @@ def _parse_doc(self) -> Union[Element, ElementTree]: parse, ) - handle_data = _get_data_from_filepath( + handle_data = get_data_from_filepath( filepath_or_buffer=self.path_or_buffer, encoding=self.encoding, compression=self.compression, storage_options=self.storage_options, ) - with _preprocess_data(handle_data) as xml_data: + with preprocess_data(handle_data) as xml_data: curr_parser = XMLParser(encoding=self.encoding) r = parse(xml_data, parser=curr_parser) @@ -383,6 +383,11 @@ class _LxmlFrameParser(_XMLFrameParser): XPath 1.0 and XSLT 1.0. 
""" + from lxml.etree import ( + Element, + ElementTree, + ) + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -554,7 +559,7 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self): + def _parse_doc(self) -> Union[Element, ElementTree]: from lxml.etree import ( XMLParser, fromstring, @@ -563,14 +568,14 @@ def _parse_doc(self): raw_doc = self.stylesheet if self.is_style else self.path_or_buffer - handle_data = _get_data_from_filepath( + handle_data = get_data_from_filepath( filepath_or_buffer=raw_doc, encoding=self.encoding, compression=self.compression, storage_options=self.storage_options, ) - with _preprocess_data(handle_data) as xml_data: + with preprocess_data(handle_data) as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): @@ -583,7 +588,7 @@ def _parse_doc(self): return r -def _get_data_from_filepath( +def get_data_from_filepath( filepath_or_buffer, encoding, compression, @@ -627,7 +632,7 @@ def _get_data_from_filepath( return filepath_or_buffer -def _preprocess_data(data) -> Union[io.StringIO, io.BytesIO]: +def preprocess_data(data) -> Union[io.StringIO, io.BytesIO]: """ Convert extracted raw data. From 6194f83c151f3c3b27ac3f50f594c1dd60d23a0a Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 27 Feb 2021 10:07:25 -0600 Subject: [PATCH 35/35] Refactor code and revert changes to avoid optional module type hints --- pandas/io/formats/xml.py | 30 +++++++----------------------- pandas/io/xml.py | 20 ++++++-------------- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 11a9a2a54e717..044b03ba83714 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -433,11 +433,6 @@ class LxmlXMLFormatter(BaseXMLFormatter): modules: `xml.etree.ElementTree` and `xml.dom.minidom`. 
""" - from lxml.etree import ( - Element, - ElementTree, - ) - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -563,15 +558,17 @@ def build_elems(self) -> None: except KeyError: raise KeyError(f"no valid column, {col}") - def parse_doc(self) -> Union[Element, ElementTree]: + def transform_doc(self) -> bytes: """ - Build tree from stylesheet. + Parse stylesheet from file or buffer and run it. This method will parse stylesheet object into tree for parsing - conditionally by its specific object type. + conditionally by its specific object type, then transforms + original tree with XSLT script. """ from lxml.etree import ( + XSLT, XMLParser, fromstring, parse, @@ -590,24 +587,11 @@ def parse_doc(self) -> Union[Element, ElementTree]: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): - r = fromstring( + xsl_doc = fromstring( xml_data.getvalue().encode(self.encoding), parser=curr_parser ) else: - r = parse(xml_data, parser=curr_parser) - - return r - - def transform_doc(self) -> bytes: - """ - Transform original tree using stylesheet. - - This method will transform built tree with XSLT script. - """ - - from lxml.etree import XSLT - - xsl_doc = self.parse_doc() + xsl_doc = parse(xml_data, parser=curr_parser) transformer = XSLT(xsl_doc) new_doc = transformer(self.root) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a797b30e17ba7..83eba5f17c7b3 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -383,11 +383,6 @@ class _LxmlFrameParser(_XMLFrameParser): XPath 1.0 and XSLT 1.0. """ - from lxml.etree import ( - Element, - ElementTree, - ) - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -400,11 +395,10 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]: and parse original or transformed XML and return specific nodes. 
""" - self.xml_doc = self._parse_doc() + self.xml_doc = self._parse_doc(self.path_or_buffer) if self.stylesheet is not None: - self.is_style = True - self.xsl_doc = self._parse_doc() + self.xsl_doc = self._parse_doc(self.stylesheet) self.xml_doc = self._transform_doc() self._validate_path() @@ -559,15 +553,13 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self) -> Union[Element, ElementTree]: + def _parse_doc(self, raw_doc): from lxml.etree import ( XMLParser, fromstring, parse, ) - raw_doc = self.stylesheet if self.is_style else self.path_or_buffer - handle_data = get_data_from_filepath( filepath_or_buffer=raw_doc, encoding=self.encoding, @@ -579,13 +571,13 @@ def _parse_doc(self) -> Union[Element, ElementTree]: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): - r = fromstring( + doc = fromstring( xml_data.getvalue().encode(self.encoding), parser=curr_parser ) else: - r = parse(xml_data, parser=curr_parser) + doc = parse(xml_data, parser=curr_parser) - return r + return doc def get_data_from_filepath(