Skip to content

Commit d8495dc

Browse files
committed
Class to read OpenDocument Tables
This is primarily intended for OpenDocument spreadsheets, such as those generated by LibreOffice Calc, but it will also work with tables in LibreOffice Writer documents.
1 parent 15d8178 commit d8495dc

10 files changed

+299
-0
lines changed

pandas/io/excel/_base.py

+2
Original file line numberDiff line numberDiff line change
@@ -749,9 +749,11 @@ class ExcelFile(object):
749749
"""
750750

751751
from pandas.io.excel._xlrd import _XlrdReader
752+
from pandas.io.excel._odfreader import _ODFReader
752753

753754
_engines = {
754755
'xlrd': _XlrdReader,
756+
'odf': _ODFReader,
755757
}
756758

757759
def __init__(self, io, engine=None):

pandas/io/excel/_odfreader.py

+165
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import pandas
2+
from pandas.io.parsers import TextParser
3+
4+
5+
class _ODFReader(object):
    """Read tables out of OpenDocument formatted files.

    Parameters
    ----------
    filepath_or_stream : string, path to be parsed or
        an open readable stream.
    """

    def __init__(self, filepath_or_stream):
        # odfpy is an optional dependency, so it is only imported on use.
        try:
            from odf.opendocument import load as document_load
            from odf.table import Table
        except ImportError:
            raise ImportError("Install odfpy for OpenDocument support")

        self.filepath_or_stream = filepath_or_stream
        self.document = document_load(filepath_or_stream)
        self.tables = self.document.getElementsByType(Table)

    @property
    def sheet_names(self):
        """Return the names of the tables in the document."""
        from odf.namespaces import TABLENS
        return [t.attributes[(TABLENS, 'name')] for t in self.tables]

    def get_sheet_by_index(self, index):
        """Return the table at position ``index`` as a list of lists.

        Raises ``IndexError`` when ``index`` is out of range.
        """
        return self.__get_table(self.tables[index])

    def get_sheet_by_name(self, name):
        """Return the table called ``name`` as a list of lists.

        Raises ``ValueError`` when no table has that name.
        """
        i = self.sheet_names.index(name)
        return self.__get_table(self.tables[i])

    def get_sheet(self, name):
        """Given a sheet name or index, return the parsed table.

        Parameters
        ----------
        name : str or int
            Table name, or position of the table in the document.

        Returns
        -------
        list of lists
            The cell values of the table, one inner list per row.

        Raises
        ------
        ValueError
            If ``name`` is neither a string nor an integer.
        """
        if isinstance(name, str):
            return self.get_sheet_by_name(name)
        if isinstance(name, int):
            return self.get_sheet_by_index(name)
        # The trailing space matters: the two literals are concatenated.
        raise ValueError(
            'Unrecognized sheet identifier type {}. Please use '
            'a string or integer'.format(type(name)))

    def parse(self, sheet_name=0, **kwds):
        """Read one table and return the result of ``TextParser.read()``.

        ``kwds`` are forwarded unchanged to ``TextParser``.
        """
        data = self.get_sheet(sheet_name)
        parser = TextParser(data, **kwds)
        return parser.read()

    def __get_table(self, sheet):
        """Parse an ODF Table into a rectangular list of lists.

        Repeated cells and rows are expanded.  Runs of empty cells/rows are
        only materialised once a non-empty cell/row follows them, so
        trailing blanks are dropped.  Short rows are padded with ``None``.
        """
        from odf.table import TableCell, TableRow

        table = []
        empty_rows = 0
        max_row_len = 0
        for sheet_row in sheet.getElementsByType(TableRow):
            empty_cells = 0
            table_row = []
            for sheet_cell in sheet_row.getElementsByType(TableCell):
                value = self.__get_cell_value(sheet_cell)
                column_repeat = self.__get_cell_repeat(sheet_cell)

                if not sheet_cell.childNodes:
                    # Defer: these only count if real data follows.
                    empty_cells += column_repeat
                    continue

                if empty_cells > 0:
                    table_row.extend([None] * empty_cells)
                    empty_cells = 0
                table_row.extend([value] * column_repeat)

            max_row_len = max(max_row_len, len(table_row))

            row_repeat = self.__get_row_repeat(sheet_row)
            if self.__is_empty_row(sheet_row):
                empty_rows += row_repeat
                continue

            if empty_rows > 0:
                # Add blank rows to our table.  Each one is a distinct
                # list so padding one row never aliases another.
                table.extend([None] for _ in range(empty_rows))
                empty_rows = 0
            table.append(table_row)

        # Make our table square
        for row in table:
            if len(row) < max_row_len:
                row.extend([None] * (max_row_len - len(row)))

        return table

    def __get_row_repeat(self, row):
        """Return number of times this row was repeated.

        Repeating an empty row appeared to be a common way
        of representing sparse rows in the table.
        """
        from odf.namespaces import TABLENS
        repeat = row.attributes.get((TABLENS, 'number-rows-repeated'))
        return 1 if repeat is None else int(repeat)

    def __get_cell_repeat(self, cell):
        """Return number of times this cell was repeated along the row."""
        from odf.namespaces import TABLENS
        repeat = cell.attributes.get((TABLENS, 'number-columns-repeated'))
        return 1 if repeat is None else int(repeat)

    def __is_empty_row(self, row):
        """Return True when no cell of ``row`` has any child node."""
        return not any(len(column.childNodes) > 0
                       for column in row.childNodes)

    @staticmethod
    def _parse_boolean(text):
        """Convert the text of an ODF boolean attribute to a ``bool``.

        Only the literal ``'true'`` is truthy; ``bool('false')`` would be
        ``True``, which is why a plain ``bool()`` cast is not used.
        """
        return text == 'true'

    def __get_cell_value(self, cell):
        """Convert a table cell to a Python/pandas value.

        The conversion is selected by the cell's ``office:value-type``
        attribute.  Cells without a value type yield ``None``.

        Raises
        ------
        ValueError
            If the value type is not one of the handled types.
        """
        from odf.namespaces import OFFICENS
        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
        if cell_type is None:
            return None
        if cell_type == 'boolean':
            # The ODF attribute is office:boolean-value ("true"/"false").
            cell_value = cell.attributes.get((OFFICENS, 'boolean-value'))
            return self._parse_boolean(cell_value)
        if cell_type in ('float', 'percentage', 'currency'):
            cell_value = cell.attributes.get((OFFICENS, 'value'))
            return float(cell_value)
        if cell_type == 'string':
            return str(cell)
        if cell_type == 'date':
            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
            return pandas.Timestamp(cell_value)
        if cell_type == 'time':
            cell_value = cell.attributes.get((OFFICENS, 'time-value'))
            return pandas_isoduration_compatibility(cell_value)
        raise ValueError('Unrecognized type {}'.format(cell_type))
154+
155+
156+
def pandas_isoduration_compatibility(duration):
    """Build a ``pandas.Timedelta`` from an ISO 8601 duration string.

    Libreoffice returns durations without any day attributes, for
    example PT3H45M0S, while the pandas Timedelta parser this was written
    against required a day component.  A zero-day component is therefore
    inserted into strings that start with ``PT`` before parsing.

    Workaround for https://github.com/pandas-dev/pandas/issues/25422
    """
    time_only_prefix = 'PT'
    if not duration.startswith(time_only_prefix):
        return pandas.Timedelta(duration)
    remainder = duration[len(time_only_prefix):]
    return pandas.Timedelta('P0DT' + remainder)
10.6 KB
Binary file not shown.

pandas/tests/io/data/datatypes.ods

10.4 KB
Binary file not shown.

pandas/tests/io/data/headers.ods

8.13 KB
Binary file not shown.
8.3 KB
Binary file not shown.
7.35 KB
Binary file not shown.
7.71 KB
Binary file not shown.

pandas/tests/io/data/writertable.odt

10.1 KB
Binary file not shown.

pandas/tests/io/test_excel.py

+132
Original file line numberDiff line numberDiff line change
@@ -2557,3 +2557,135 @@ def test_excelwriter_fspath(self):
25572557
with tm.ensure_clean('foo.xlsx') as path:
25582558
writer = ExcelWriter(path)
25592559
assert os.fspath(writer) == str(path)
2560+
2561+
2562+
@td.skip_if_no('odf')
class TestODFReader(SharedItems):
    """Tests for reading OpenDocument files through the ``odf`` engine."""

    def test_get_sheet(self):
        """Invalid sheet identifiers raise; the fixture has one sheet."""
        from pandas.io.excel._odfreader import _ODFReader

        reader = _ODFReader(os.path.join(self.dirpath, 'datatypes.ods'))

        with pytest.raises(ValueError):
            reader.get_sheet(3.14)

        with pytest.raises(ValueError):
            reader.get_sheet_by_name("Invalid Sheet 77")

        with pytest.raises(IndexError):
            reader.get_sheet_by_index(-33)

        names = reader.sheet_names
        assert len(names) == 1
        assert names == ['Sheet1']

    def test_read_types(self):
        """Each ODF value type is converted to the expected Python value."""
        durations = [
            pd.Timedelta(hours=hours, minutes=minutes)
            for hours, minutes in ((3, 45), (17, 53), (14, 8))
        ]
        rows = [
            [1.0],
            [1.25],
            ['a'],
            [pd.Timestamp(2003, 1, 2)],
            [False],
            [0.35],
            durations,
            # though what should the value of a hyperlink be?
            ['UBERON:0002101'],
        ]
        expected = DataFrame(rows)

        result = self.get_exceldf(
            'datatypes', '.ods', header=None, engine='odf')
        tm.assert_equal(result, expected)

    def test_read_invalid_types(self):
        """An unknown value-type must raise ``ValueError``.

        The fixture was produced by hand-editing the extracted XML of an
        ods file, so it probably won't open in LibreOffice correctly.
        """
        expected_message = "Unrecognized type awesome_new_type"
        with pytest.raises(ValueError, match=expected_message):
            self.get_exceldf(
                'invalid_value_type', '.ods', header=None, engine='odf')

    def test_read_lower_diagonal(self):
        """An irregular (ragged) table is read as a square frame.

        TextParser failed when given an irregular list of lists, so make
        sure we can parse:
        1
        2 3
        4 5 6
        7 8 9 10
        """
        result = self.get_exceldf(
            'lowerdiagonal', '.ods', 'Sheet1',
            index_col=None, header=None, engine='odf')

        assert result.shape == (4, 4)

    def test_read_headers(self):
        """Header rows are used as column names, blanks included."""
        result = self.get_exceldf(
            'headers', '.ods', 'Sheet1', index_col=0, engine='odf')

        # "Column 3" and "Column 5" are empty in the fixture.
        named_columns = OrderedDict([
            ("Header", ["Row 1", "Row 2"]),
            ("Column 1", [1.0, 2.0]),
            ("Column 2", [3.0, 4.0]),
            ("Column 4", [7.0, 8.0]),
            ("Column 6", [11.0, 12.0]),
        ])
        expected = DataFrame.from_dict(named_columns).set_index("Header")

        populated = ["Column 1", "Column 2", "Column 4", "Column 6"]
        tm.assert_equal(result[populated], expected)

        for blank_name in (None, 'None.1'):
            assert all(pd.isnull(cell) for cell in result[blank_name])

    def test_read_writer_table(self):
        """A table embedded in a text document can be read.

        ODF reuses the same table tags in Writer and Presentation files.
        """
        result = self.get_exceldf(
            'writertable', '.odt', 'Table1', index_col=0, engine='odf')

        assert result.shape == (3, 3)

        expected = DataFrame.from_dict(OrderedDict([
            ("Header", ["Row 1", "Row 2", "Row 3"]),
            ("Column 1", [1.0, 2.0, 3.0]),
            ("Unnamed: 2", [nan, nan, nan]),
            ("Column 3", [7.0, 8.0, 9.0]),
        ])).set_index("Header")

        populated = ["Column 1", "Column 3"]
        tm.assert_equal(result[populated], expected[populated])

        # make sure pandas gives a name to the unnamed column
        unnamed = result["Unnamed: 2"]
        for position in range(3):
            assert pd.isnull(unnamed[position])

    def test_blank_row_repeat(self):
        """Repeated blank rows in the middle of a table are expanded."""
        result = self.get_exceldf(
            'blank-row-repeat', '.ods', 'Value', engine='odf')

        assert result.shape == (14, 2)

        values = result['value']
        assert values[7] == 9.0
        assert pd.isnull(values[8])
        assert not pd.isnull(values[11])

    def test_runlengthencoding(self):
        """Repeated cells are expanded.

        Calc will use repeat when adjacent columns have the same value.
        """
        result = self.get_exceldf(
            'runlengthencoding', '.ods', 'Sheet1', header=None, engine='odf')
        assert result.shape == (5, 3)

        # check by column, not by row.
        expected_by_column = {
            0: [1.0, 1.0, 2.0, 2.0, 2.0],
            1: [1.0, 2.0, 2.0, 2.0, 2.0],
            2: [1.0, 2.0, 2.0, 2.0, 2.0],
        }
        for label, expected_values in expected_by_column.items():
            assert list(result[label]) == expected_values

0 commit comments

Comments
 (0)