Skip to content

Commit b11b2f8

Browse files
committed
Google Cloud Storage support using gcsfs
1 parent 172ab7a commit b11b2f8

File tree

9 files changed

+67
-7
lines changed

9 files changed

+67
-7
lines changed

ci/check_imports.py

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
blacklist = {
77
'bs4',
8+
'gcsfs',
89
'html5lib',
910
'ipython',
1011
'jinja2'

ci/requirements-optional-pip.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@ sqlalchemy
2626
xarray
2727
xlrd
2828
xlsxwriter
29-
xlwt
29+
xlwt

doc/source/install.rst

+1
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ Optional Dependencies
276276
* `Jinja2 <http://jinja.pocoo.org/>`__: Template engine for conditional HTML formatting.
277277
* `s3fs <http://s3fs.readthedocs.io/>`__: necessary for Amazon S3 access (s3fs >= 0.0.7).
278278
* `blosc <https://pypi.org/project/blosc>`__: for msgpack compression using ``blosc``
279+
* `gcsfs <http://gcsfs.readthedocs.io/>`__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0).
279280
* One of
280281
`qtpy <https://github.com/spyder-ide/qtpy>`__ (requires PyQt or PySide),
281282
`PyQt5 <https://www.riverbankcomputing.com/software/pyqt/download5>`__,

pandas/io/common.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _is_url(url):
8888
"""
8989
try:
9090
return parse_url(url).scheme in _VALID_URLS
91-
except:
91+
except Exception:
9292
return False
9393

9494

@@ -165,7 +165,15 @@ def is_s3_url(url):
165165
"""Check for an s3, s3n, or s3a url"""
166166
try:
167167
return parse_url(url).scheme in ['s3', 's3n', 's3a']
168-
except: # noqa
168+
except Exception:
169+
return False
170+
171+
172+
def is_gcs_url(url):
173+
"""Check for a gcs url"""
174+
try:
175+
return parse_url(url).scheme in ['gcs', 'gs']
176+
except Exception:
169177
return False
170178

171179

@@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
208216
compression=compression,
209217
mode=mode)
210218

219+
if is_gcs_url(filepath_or_buffer):
220+
from pandas.io import gcs
221+
return gcs.get_filepath_or_buffer(filepath_or_buffer,
222+
encoding=encoding,
223+
compression=compression,
224+
mode=mode)
225+
211226
if isinstance(filepath_or_buffer, (compat.string_types,
212227
compat.binary_type,
213228
mmap.mmap)):

pandas/io/excel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
io : string, path object (pathlib.Path or py._path.local.LocalPath),
4747
file-like object, pandas ExcelFile, or xlrd workbook.
4848
The string could be a URL. Valid URL schemes include http, ftp, s3,
49-
and file. For file URLs, a host is expected. For instance, a local
49+
gcs, and file. For file URLs, a host is expected. For instance, a local
5050
file could be file://localhost/path/to/workbook.xlsx
5151
sheet_name : string, int, mixed list of strings/ints, or None, default 0
5252

pandas/io/gcs.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
""" GCS support for remote file interactivity """
2+
try:
3+
import gcsfs
4+
except ImportError:
5+
raise ImportError("The gcsfs library is required to handle GCS files")
6+
7+
8+
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
9+
compression=None, mode=None):
10+
11+
if mode is None:
12+
mode = 'rb'
13+
14+
fs = gcsfs.GCSFileSystem()
15+
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
16+
return filepath_or_buffer, None, compression, True

pandas/io/json/json.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
231231
Parameters
232232
----------
233233
path_or_buf : a valid JSON string or file-like, default: None
234-
The string could be a URL. Valid URL schemes include http, ftp, s3, and
235-
file. For file URLs, a host is expected. For instance, a local file
236-
could be ``file://localhost/path/to/table.json``
234+
The string could be a URL. Valid URL schemes include http, ftp, s3,
235+
gcs, and file. For file URLs, a host is expected. For instance, a local
236+
file could be ``file://localhost/path/to/table.json``
237237
238238
orient : string,
239239
Indication of expected JSON string format.

pandas/tests/io/test_gcs.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from pandas import DataFrame, read_csv
2+
from pandas.compat import BytesIO
3+
from pandas.io.common import is_gcs_url
4+
5+
6+
class TestGCSURL(object):
7+
8+
def test_is_gcs_url(self):
9+
assert is_gcs_url("gcs://pandas/somethingelse.com")
10+
assert is_gcs_url("gs://pandas/somethingelse.com")
11+
assert not is_gcs_url("s3://pandas/somethingelse.com")
12+
13+
def test_read_csv_gcs(self):
14+
try:
15+
from unittest.mock import patch
16+
except ImportError:
17+
from mock import patch
18+
19+
with patch('gcsfs.GCSFileSystem') as MockFileSystem:
20+
instance = MockFileSystem.return_value
21+
instance.open.return_value = BytesIO(b'a,b\n1,2\n3,4')
22+
df = read_csv('gs://test/test.csv')
23+
24+
assert isinstance(df, DataFrame)
25+
assert len(df) == 2
26+
assert all(df.columns == ['a', 'b'])

pandas/util/_print_versions.py

+1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def show_versions(as_json=False):
9696
("fastparquet", lambda mod: mod.__version__),
9797
("pandas_gbq", lambda mod: mod.__version__),
9898
("pandas_datareader", lambda mod: mod.__version__),
99+
("gcsfs", lambda mod: mod.__version__),
99100
]
100101

101102
deps_blob = list()

0 commit comments

Comments
 (0)