Merge pull request #7531 from dsm054/excel-via-url

jreback · jreback · commit 9c22ba1a27cf · 2014-06-23T10:03:11.000-04:00
WIP/ENH: allow read_excel to accept URLs (GH6809)
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -108,6 +108,9 @@ Enhancements
 - ``read_html`` now sports an ``encoding`` argument that is passed to the
   underlying parser library. You can use this to read non-ascii encoded web
   pages (:issue:`7323`).
+- ``read_excel`` now supports reading from URLs in the same way
+  that ``read_csv`` does. (:issue:`6809`)
+
 
 - Support for dateutil timezones, which can now be used in the same way as
   pytz timezones across pandas. (:issue:`4688`)
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 from pandas.io.parsers import TextParser
+from pandas.io.common import _is_url, _urlopen
 from pandas.tseries.period import Period
 from pandas import json
 from pandas.compat import map, zip, reduce, range, lrange, u, add_metaclass
@@ -56,8 +57,10 @@ def read_excel(io, sheetname=0, **kwds):
 
     Parameters
     ----------
-    io : string, file-like object or xlrd workbook
-        If a string, expected to be a path to xls or xlsx file
+    io : string, file-like object, or xlrd workbook
+        A string may be a local path or a URL. Valid URL schemes include
+        http, ftp, s3, and file. A file URL must include a host, e.g.
+        file://localhost/path/to/workbook.xlsx
     sheetname : string or int, default 0
         Name of Excel sheet or the page number of the sheet
     header : int, default 0
@@ -98,6 +101,7 @@ def read_excel(io, sheetname=0, **kwds):
     -------
     parsed : DataFrame
         DataFrame from the passed in Excel file
+
     """
     if 'kind' in kwds:
         kwds.pop('kind')
@@ -139,11 +143,16 @@ def __init__(self, io, **kwds):
             raise ValueError("Unknown engine: %s" % engine)
 
         if isinstance(io, compat.string_types):
-            self.book = xlrd.open_workbook(io)
-        elif engine == "xlrd" and isinstance(io, xlrd.Book):
+            if _is_url(io):
+                data = _urlopen(io).read()
+                self.book = xlrd.open_workbook(file_contents=data)
+            else:
+                self.book = xlrd.open_workbook(io)
+        elif engine == 'xlrd' and isinstance(io, xlrd.Book):
             self.book = io
-        elif hasattr(io, "read"):
-            data = io.read()
+        elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
+            # N.B. xlrd.Book has a read attribute too
+            data = io.read() 
             self.book = xlrd.open_workbook(file_contents=data)
         else:
             raise ValueError('Must explicitly set engine if not passing in'
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -2,6 +2,7 @@
 
 from pandas.compat import u, range, map, openpyxl_compat
 from datetime import datetime, date, time
+import sys
 import os
 from distutils.version import LooseVersion
 
@@ -11,13 +12,15 @@
 
 from numpy import nan
 import numpy as np
+from numpy.testing.decorators import slow
 
 from pandas import DataFrame, Index, MultiIndex
 from pandas.io.parsers import read_csv
 from pandas.io.excel import (
     ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _OpenpyxlWriter,
     register_writer, _XlsxWriter
 )
+from pandas.io.common import URLError
 from pandas.util.testing import ensure_clean
 from pandas.core.config import set_option, get_option
 import pandas.util.testing as tm
@@ -280,6 +283,39 @@ def test_read_xlrd_Book(self):
             result = read_excel(book, sheetname="SheetA", engine="xlrd")
             tm.assert_frame_equal(df, result)
 
+    @tm.network
+    def test_read_from_http_url(self):
+        # The workbook fetched over HTTPS must equal the local test copy.
+        _skip_if_no_xlrd()
+
+        remote_url = ('https://raw.github.com/pydata/pandas/master/'
+                      'pandas/io/tests/data/test.xlsx')
+        from_url = read_excel(remote_url)
+        local_path = os.path.join(tm.get_data_path(), 'test.xlsx')
+        from_disk = read_excel(local_path)
+        tm.assert_frame_equal(from_url, from_disk)
+
+    @slow
+    def test_read_from_file_url(self):
+        # A file:// URL must parse to the same frame as the plain local path.
+        _skip_if_no_xlrd()
+        if sys.version_info[:2] < (2, 6):
+            raise nose.SkipTest("file:// not supported with Python < 2.6")
+        dirpath = tm.get_data_path()
+        localtable = os.path.join(dirpath, 'test.xlsx')
+        local_table = read_excel(localtable)
+
+        try:
+            url_table = read_excel('file://localhost/' + localtable)
+        except URLError:
+            # fails on some systems; platform imported locally for the message
+            import platform
+            raise nose.SkipTest("failing on %s" %
+                                ' '.join(platform.uname()).strip())
+
+        tm.assert_frame_equal(url_table, local_table)
+
+
     def test_xlsx_table(self):
         _skip_if_no_xlrd()
         _skip_if_no_openpyxl()