pandas-dev
diff --git a/‎pandas/io/excel/_base.py
Lines changed: 2 additions & 0 deletions b/‎pandas/io/excel/_base.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎pandas/io/excel/_odfreader.py
Lines changed: 165 additions & 0 deletions b/‎pandas/io/excel/_odfreader.py
Lines changed: 165 additions & 0 deletions
diff --git a/‎pandas/tests/io/data/blank-row-repeat.ods
10.6 KB b/‎pandas/tests/io/data/blank-row-repeat.ods
10.6 KB
diff --git a/‎pandas/tests/io/data/datatypes.ods
10.4 KB b/‎pandas/tests/io/data/datatypes.ods
10.4 KB
diff --git a/‎pandas/tests/io/data/headers.ods
8.13 KB b/‎pandas/tests/io/data/headers.ods
8.13 KB
diff --git a/‎pandas/tests/io/data/invalid_value_type.ods
8.3 KB b/‎pandas/tests/io/data/invalid_value_type.ods
8.3 KB
diff --git a/‎pandas/tests/io/data/lowerdiagonal.ods
7.35 KB b/‎pandas/tests/io/data/lowerdiagonal.ods
7.35 KB
diff --git a/‎pandas/tests/io/data/runlengthencoding.ods
7.71 KB b/‎pandas/tests/io/data/runlengthencoding.ods
7.71 KB
diff --git a/‎pandas/tests/io/data/writertable.odt
10.1 KB b/‎pandas/tests/io/data/writertable.odt
10.1 KB
diff --git a/‎pandas/tests/io/test_excel.py
Lines changed: 132 additions & 0 deletions b/‎pandas/tests/io/test_excel.py
Lines changed: 132 additions & 0 deletions
@@ -749,9 +749,11 @@ class ExcelFile(object):
     """
 
     from pandas.io.excel._xlrd import _XlrdReader
+    from pandas.io.excel._odfreader import _ODFReader
 
     _engines = {
         'xlrd': _XlrdReader,
+        'odf': _ODFReader,
     }
 
     def __init__(self, io, engine=None):
 
@@ -0,0 +1,165 @@
+import pandas
+from pandas.io.parsers import TextParser
+
+
+class _ODFReader(object):
+    """Read tables out of OpenDocument formatted files.
+
+    Parameters
+    ----------
+    filepath_or_stream : str or file-like
+        Path to be parsed, or an open readable stream.
+
+    Raises
+    ------
+    ImportError
+        If the optional ``odfpy`` package cannot be imported.
+    """
+
+    def __init__(self, filepath_or_stream):
+        try:
+            from odf.opendocument import load as document_load
+            from odf.table import Table
+        except ImportError:
+            raise ImportError("Install odfpy for OpenDocument support")
+
+        self.filepath_or_stream = filepath_or_stream
+        self.document = document_load(filepath_or_stream)
+        self.tables = self.document.getElementsByType(Table)
+
+    @property
+    def sheet_names(self):
+        """Return the names of the tables in the document."""
+        from odf.namespaces import TABLENS
+        return [t.attributes[(TABLENS, 'name')] for t in self.tables]
+
+    def get_sheet_by_index(self, index):
+        """Return the parsed rows of the table at position ``index``.
+
+        Raises ``IndexError`` when ``index`` is out of range.
+        """
+        return self.__get_table(self.tables[index])
+
+    def get_sheet_by_name(self, name):
+        """Return the parsed rows of the table called ``name``.
+
+        Raises ``ValueError`` when no table has that name.
+        """
+        i = self.sheet_names.index(name)
+        return self.__get_table(self.tables[i])
+
+    def get_sheet(self, name):
+        """Return the parsed rows of a table given its name or index.
+
+        Parameters
+        ----------
+        name : str or int
+            Table name, or position of the table in the document.
+
+        Returns
+        -------
+        list of list
+            The table contents, one inner list per row.
+
+        Raises
+        ------
+        ValueError
+            If ``name`` is neither a string nor an integer.
+        """
+        if isinstance(name, str):
+            return self.get_sheet_by_name(name)
+        elif isinstance(name, int):
+            return self.get_sheet_by_index(name)
+        else:
+            raise ValueError(
+                'Unrecognized sheet identifier type {}. Please use '
+                'a string or integer'.format(type(name)))
+
+    def parse(self, sheet_name=0, **kwds):
+        """Read one table and return what ``TextParser.read`` produces.
+
+        ``kwds`` are forwarded unchanged to ``TextParser``.
+        """
+        data = self.get_sheet(sheet_name)
+        parser = TextParser(data, **kwds)
+        return parser.read()
+
+    def __get_table(self, sheet):
+        """Parse an ODF Table into a rectangular list of lists.
+
+        Blank cells and blank rows are buffered and only written out once
+        a later non-blank cell/row is met, so trailing padding is dropped.
+        Repeated cells and repeated rows are expanded.
+        """
+        from odf.table import TableCell, TableRow
+
+        table = []
+        empty_rows = 0
+        max_row_len = 0
+        for sheet_row in sheet.getElementsByType(TableRow):
+            table_row = []
+            empty_cells = 0
+            for sheet_cell in sheet_row.getElementsByType(TableCell):
+                # Evaluated for every cell so that an unknown value-type
+                # is reported even when the cell has no content.
+                value = self.__get_cell_value(sheet_cell)
+                column_repeat = self.__get_cell_repeat(sheet_cell)
+
+                if len(sheet_cell.childNodes) == 0:
+                    empty_cells += column_repeat
+                else:
+                    # flush the blanks that preceded this cell
+                    table_row.extend([None] * empty_cells)
+                    empty_cells = 0
+                    table_row.extend([value] * column_repeat)
+
+            max_row_len = max(max_row_len, len(table_row))
+
+            row_repeat = self.__get_row_repeat(sheet_row)
+            if self.__is_empty_row(sheet_row):
+                empty_rows += row_repeat
+            else:
+                # Flush buffered blank rows; each one is its own list so
+                # padding one of them never touches the others.
+                table.extend([None] for _ in range(empty_rows))
+                empty_rows = 0
+                # Identical adjacent rows are stored once with a repeat
+                # count; emit an independent copy for each repetition.
+                table.extend(list(table_row) for _ in range(row_repeat))
+
+        # Make our table square
+        for row in table:
+            row.extend([None] * (max_row_len - len(row)))
+
+        return table
+
+    def __get_row_repeat(self, row):
+        """Return number of times this row was repeated.
+
+        Repeating an empty row appeared to be a common way
+        of representing sparse rows in the table.
+        """
+        from odf.namespaces import TABLENS
+        repeat = row.attributes.get((TABLENS, 'number-rows-repeated'))
+        if repeat is None:
+            return 1
+        return int(repeat)
+
+    def __get_cell_repeat(self, cell):
+        """Return number of times this cell was repeated (1 if unset)."""
+        from odf.namespaces import TABLENS
+        repeat = cell.attributes.get((TABLENS, 'number-columns-repeated'))
+        if repeat is None:
+            return 1
+        return int(repeat)
+
+    def __is_empty_row(self, row):
+        """Return True when no child of ``row`` has child nodes."""
+        return all(len(column.childNodes) == 0 for column in row.childNodes)
+
+    def __get_cell_value(self, cell):
+        """Convert a cell to a Python/pandas value based on its value-type.
+
+        Returns ``None`` for a cell without a value-type and raises
+        ``ValueError`` for a value-type that is not handled here.
+        """
+        from odf.namespaces import OFFICENS
+        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
+        if cell_type is None:
+            return None
+        if cell_type == 'boolean':
+            # The value is the literal text "true"/"false" held in
+            # office:boolean-value; bool() of either string is True, so
+            # compare explicitly.
+            cell_value = cell.attributes.get((OFFICENS, 'boolean-value'))
+            return cell_value == 'true'
+        if cell_type in ('float', 'percentage', 'currency'):
+            cell_value = cell.attributes.get((OFFICENS, 'value'))
+            return float(cell_value)
+        if cell_type == 'string':
+            return str(cell)
+        if cell_type == 'date':
+            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
+            return pandas.Timestamp(cell_value)
+        if cell_type == 'time':
+            cell_value = cell.attributes.get((OFFICENS, 'time-value'))
+            return pandas_isoduration_compatibility(cell_value)
+        raise ValueError('Unrecognized type {}'.format(cell_type))
+
+
+def pandas_isoduration_compatibility(duration):
+    """Build a Timedelta from an ISO 8601 duration lacking a day part.
+
+    Libreoffice writes durations such as PT3H45M0S, with no day
+    attribute, whereas the pandas Timedelta parser this was written
+    against requires one. A zero-day component is therefore spliced in
+    before parsing; any other input is handed to Timedelta untouched.
+    Workaround for https://github.com/pandas-dev/pandas/issues/25422
+    """
+    time_only_prefix = 'PT'
+    if not duration.startswith(time_only_prefix):
+        return pandas.Timedelta(duration)
+    remainder = duration[len(time_only_prefix):]
+    return pandas.Timedelta('P0DT' + remainder)
@@ -2557,3 +2557,135 @@ def test_excelwriter_fspath(self):
         with tm.ensure_clean('foo.xlsx') as path:
             writer = ExcelWriter(path)
             assert os.fspath(writer) == str(path)
+
+
+@td.skip_if_no('odf')
+class TestODFReader(SharedItems):
+    def test_get_sheet(self):
+        """Bad sheet identifiers raise; the single sheet name is listed."""
+        from pandas.io.excel._odfreader import _ODFReader
+
+        reader = _ODFReader(os.path.join(self.dirpath, 'datatypes.ods'))
+
+        with pytest.raises(ValueError):
+            reader.get_sheet(3.14)
+
+        with pytest.raises(ValueError):
+            reader.get_sheet_by_name("Invalid Sheet 77")
+
+        with pytest.raises(IndexError):
+            reader.get_sheet_by_index(-33)
+
+        assert len(reader.sheet_names) == 1
+        assert reader.sheet_names == ['Sheet1']
+
+    def test_read_types(self):
+        """Each ODF value-type comes back as the matching Python type."""
+        result = self.get_exceldf(
+            'datatypes', '.ods', header=None, engine='odf')
+
+        durations = [pd.Timedelta(hours=3, minutes=45),
+                     pd.Timedelta(hours=17, minutes=53),
+                     pd.Timedelta(hours=14, minutes=8)]
+        rows = [
+            [1.0],
+            [1.25],
+            ['a'],
+            [pd.Timestamp(2003, 1, 2)],
+            [False],
+            [0.35],
+            durations,
+            # though what should the value of a hyperlink be?
+            ['UBERON:0002101'],
+        ]
+        tm.assert_equal(result, DataFrame(rows))
+
+    def test_read_invalid_types(self):
+        """An unknown value-type must surface as a ValueError.
+
+        The fixture was produced by hand-editing the extracted xml, so
+        it probably won't open in LibreOffice correctly.
+        """
+        message = "Unrecognized type awesome_new_type"
+        with pytest.raises(ValueError, match=message):
+            self.get_exceldf(
+                'invalid_value_type', '.ods', header=None, engine='odf')
+
+    def test_read_lower_diagonal(self):
+        """Ragged rows are padded before reaching TextParser.
+
+        The fixture holds:
+        1
+        2 3
+        4 5 6
+        7 8 9 10
+        """
+        result = self.get_exceldf(
+            'lowerdiagonal', '.ods', 'Sheet1',
+            index_col=None, header=None, engine='odf')
+
+        assert result.shape == (4, 4)
+
+    def test_read_headers(self):
+        """Header row and index column are picked up; gaps stay null."""
+        result = self.get_exceldf(
+            'headers', '.ods', 'Sheet1', index_col=0, engine='odf')
+
+        populated = OrderedDict([
+            ("Header", ["Row 1", "Row 2"]),
+            ("Column 1", [1.0, 2.0]),
+            ("Column 2", [3.0, 4.0]),
+            # Empty Column
+            ("Column 4", [7.0, 8.0]),
+            # Empty Column 2
+            ("Column 6", [11.0, 12.0])])
+        expected = DataFrame.from_dict(populated).set_index("Header")
+
+        wanted = ["Column 1", "Column 2", "Column 4", "Column 6"]
+        tm.assert_equal(result[wanted], expected)
+
+        for blank_label in (None, 'None.1'):
+            assert all(pd.isnull(cell) for cell in result[blank_label])
+
+    def test_read_writer_table(self):
+        """Tables embedded in a text document are readable too.
+
+        ODF reuses the same table tags in Writer and Presentation files.
+        """
+        result = self.get_exceldf(
+            'writertable', '.odt', 'Table1', index_col=0, engine='odf')
+
+        assert result.shape == (3, 3)
+
+        contents = OrderedDict([
+            ("Header", ["Row 1", "Row 2", "Row 3"]),
+            ("Column 1", [1.0, 2.0, 3.0]),
+            ("Unnamed: 2", [nan, nan, nan]),
+            ("Column 3", [7.0, 8.0, 9.0])])
+        expected = DataFrame.from_dict(contents).set_index("Header")
+
+        filled = ["Column 1", "Column 3"]
+        tm.assert_equal(result[filled], expected[filled])
+
+        # make sure pandas gives a name to the unnamed column
+        unnamed = result["Unnamed: 2"]
+        for position in range(3):
+            assert pd.isnull(unnamed[position])
+
+    def test_blank_row_repeat(self):
+        """Repeated blank rows between data rows are expanded."""
+        result = self.get_exceldf(
+            'blank-row-repeat', '.ods', 'Value', engine='odf')
+
+        assert result.shape == (14, 2)
+        values = result['value']
+        assert values[7] == 9.0
+        assert pd.isnull(values[8])
+        assert not pd.isnull(values[11])
+
+    def test_runlengthencoding(self):
+        """Calc will use repeat when adjacent columns have the same value.
+        """
+        result = self.get_exceldf(
+            'runlengthencoding', '.ods', 'Sheet1', header=None, engine='odf')
+        assert result.shape == (5, 3)
+
+        # check by column, not by row.
+        expected_columns = [
+            [1.0, 1.0, 2.0, 2.0, 2.0],
+            [1.0, 2.0, 2.0, 2.0, 2.0],
+            [1.0, 2.0, 2.0, 2.0, 2.0],
+        ]
+        for label, expected in enumerate(expected_columns):
+            assert list(result[label]) == expected
Original file line number	Diff line number	Diff line change
`@@ -749,9 +749,11 @@ class ExcelFile(object):`
`749`	`749`	`"""`
`750`	`750`
`751`	`751`	`from pandas.io.excel._xlrd import _XlrdReader`
	`752`	`+ from pandas.io.excel._odfreader import _ODFReader`
`752`	`753`
`753`	`754`	`_engines = {`
`754`	`755`	`'xlrd': _XlrdReader,`
	`756`	`+ 'odf': _ODFReader,`
`755`	`757`	`}`
`756`	`758`
`757`	`759`	`def __init__(self, io, engine=None):`