Skip to content

Commit 9c22ba1

Browse files
committed
Merge pull request #7531 from dsm054/excel-via-url
WIP/ENH: allow read_excel to accept URLs (GH6809)
2 parents 384c195 + 6023e24 commit 9c22ba1

File tree

3 files changed

+54
-6
lines changed

3 files changed

+54
-6
lines changed

doc/source/v0.14.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ Enhancements
108108
- ``read_html`` now sports an ``encoding`` argument that is passed to the
109109
underlying parser library. You can use this to read non-ascii encoded web
110110
pages (:issue:`7323`).
111+
- ``read_excel`` now supports reading from URLs in the same way
112+
that ``read_csv`` does (:issue:`6809`).
113+
111114

112115
- Support for dateutil timezones, which can now be used in the same way as
113116
pytz timezones across pandas. (:issue:`4688`)

pandas/io/excel.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import numpy as np
1111

1212
from pandas.io.parsers import TextParser
13+
from pandas.io.common import _is_url, _urlopen
1314
from pandas.tseries.period import Period
1415
from pandas import json
1516
from pandas.compat import map, zip, reduce, range, lrange, u, add_metaclass
@@ -56,8 +57,10 @@ def read_excel(io, sheetname=0, **kwds):
5657
5758
Parameters
5859
----------
59-
io : string, file-like object or xlrd workbook
60-
If a string, expected to be a path to xls or xlsx file
60+
io : string, file-like object, or xlrd workbook.
61+
The string could be a URL. Valid URL schemes include http, ftp, s3,
62+
and file. For file URLs, a host is expected. For instance, a local
63+
file could be file://localhost/path/to/workbook.xlsx
6164
sheetname : string or int, default 0
6265
Name of Excel sheet or the page number of the sheet
6366
header : int, default 0
@@ -98,6 +101,7 @@ def read_excel(io, sheetname=0, **kwds):
98101
-------
99102
parsed : DataFrame
100103
DataFrame from the passed in Excel file
104+
101105
"""
102106
if 'kind' in kwds:
103107
kwds.pop('kind')
@@ -139,11 +143,16 @@ def __init__(self, io, **kwds):
139143
raise ValueError("Unknown engine: %s" % engine)
140144

141145
if isinstance(io, compat.string_types):
142-
self.book = xlrd.open_workbook(io)
143-
elif engine == "xlrd" and isinstance(io, xlrd.Book):
146+
if _is_url(io):
147+
data = _urlopen(io).read()
148+
self.book = xlrd.open_workbook(file_contents=data)
149+
else:
150+
self.book = xlrd.open_workbook(io)
151+
elif engine == 'xlrd' and isinstance(io, xlrd.Book):
144152
self.book = io
145-
elif hasattr(io, "read"):
146-
data = io.read()
153+
elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
154+
# N.B. xlrd.Book has a read attribute too
155+
data = io.read()
147156
self.book = xlrd.open_workbook(file_contents=data)
148157
else:
149158
raise ValueError('Must explicitly set engine if not passing in'

pandas/io/tests/test_excel.py

+36
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pandas.compat import u, range, map, openpyxl_compat
44
from datetime import datetime, date, time
5+
import sys
56
import os
67
from distutils.version import LooseVersion
78

@@ -11,13 +12,15 @@
1112

1213
from numpy import nan
1314
import numpy as np
15+
from numpy.testing.decorators import slow
1416

1517
from pandas import DataFrame, Index, MultiIndex
1618
from pandas.io.parsers import read_csv
1719
from pandas.io.excel import (
1820
ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _OpenpyxlWriter,
1921
register_writer, _XlsxWriter
2022
)
23+
from pandas.io.common import URLError
2124
from pandas.util.testing import ensure_clean
2225
from pandas.core.config import set_option, get_option
2326
import pandas.util.testing as tm
@@ -280,6 +283,39 @@ def test_read_xlrd_Book(self):
280283
result = read_excel(book, sheetname="SheetA", engine="xlrd")
281284
tm.assert_frame_equal(df, result)
282285

286+
@tm.network
287+
def test_read_from_http_url(self):
288+
_skip_if_no_xlrd()
289+
290+
url = ('https://raw.github.com/pydata/pandas/master/'
291+
'pandas/io/tests/data/test.xlsx')
292+
url_table = read_excel(url)
293+
dirpath = tm.get_data_path()
294+
localtable = os.path.join(dirpath, 'test.xlsx')
295+
local_table = read_excel(localtable)
296+
tm.assert_frame_equal(url_table, local_table)
297+
298+
@slow
299+
def test_read_from_file_url(self):
300+
_skip_if_no_xlrd()
301+
302+
# FILE
303+
if sys.version_info[:2] < (2, 6):
304+
raise nose.SkipTest("file:// not supported with Python < 2.6")
305+
dirpath = tm.get_data_path()
306+
localtable = os.path.join(dirpath, 'test.xlsx')
307+
local_table = read_excel(localtable)
308+
309+
try:
310+
url_table = read_excel('file://localhost/' + localtable)
311+
except URLError:
312+
# fails on some systems
313+
raise nose.SkipTest("failing on %s" %
314+
' '.join(platform.uname()).strip())
315+
316+
tm.assert_frame_equal(url_table, local_table)
317+
318+
283319
def test_xlsx_table(self):
284320
_skip_if_no_xlrd()
285321
_skip_if_no_openpyxl()

0 commit comments

Comments
 (0)