diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml
index cfc6a796bd77e..10511ac0e00ca 100644
--- a/ci/appveyor-27.yaml
+++ b/ci/appveyor-27.yaml
@@ -6,6 +6,7 @@ dependencies:
- beautifulsoup4
- bottleneck
- dateutil
+ - gcsfs
- html5lib
- jinja2=2.8
- lxml
diff --git a/ci/check_imports.py b/ci/check_imports.py
index d6f24ebcc4d3e..3f09290f8c375 100644
--- a/ci/check_imports.py
+++ b/ci/check_imports.py
@@ -5,6 +5,7 @@
blacklist = {
'bs4',
+ 'gcsfs',
'html5lib',
'ipython',
'jinja2'
diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml
index cc852c1e2aeeb..f44e98e1ee09d 100644
--- a/ci/circle-36-locale_slow.yaml
+++ b/ci/circle-36-locale_slow.yaml
@@ -5,6 +5,7 @@ channels:
dependencies:
- beautifulsoup4
- cython
+ - gcsfs
- html5lib
- ipython
- jinja2
diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt
index e8cfcdf80f2e8..9e4e8e99b5205 100644
--- a/ci/requirements-optional-conda.txt
+++ b/ci/requirements-optional-conda.txt
@@ -3,6 +3,7 @@ blosc
bottleneck
fastparquet
feather-format
+gcsfs
html5lib
ipython>=5.6.0
ipykernel
diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt
index 877c52fa0b4fd..3cce3f5339883 100644
--- a/ci/requirements-optional-pip.txt
+++ b/ci/requirements-optional-pip.txt
@@ -5,6 +5,7 @@ blosc
bottleneck
fastparquet
feather-format
+gcsfs
html5lib
ipython>=5.6.0
ipykernel
diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml
index 22b993a2da886..482b888b88062 100644
--- a/ci/travis-27.yaml
+++ b/ci/travis-27.yaml
@@ -9,6 +9,7 @@ dependencies:
- fastparquet
- feather-format
- flake8=3.4.1
+ - gcsfs
- html5lib
- ipython
- jemalloc=4.5.0.post
diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml
index 006276ba1a65f..ff4f1a4a86f99 100644
--- a/ci/travis-36.yaml
+++ b/ci/travis-36.yaml
@@ -8,6 +8,7 @@ dependencies:
- dask
- fastparquet
- feather-format
+ - gcsfs
- geopandas
- html5lib
- ipython
diff --git a/doc/source/install.rst b/doc/source/install.rst
index fa6b9f4fc7f4d..a8c5194124829 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -276,6 +276,7 @@ Optional Dependencies
* `Jinja2 `__: Template engine for conditional HTML formatting.
* `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7).
* `blosc `__: for msgpack compression using ``blosc``
+* `gcsfs <https://gcsfs.readthedocs.io/>`__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0).
* One of
`qtpy `__ (requires PyQt or PySide),
`PyQt5 `__,
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index a63276efc5b7c..0fe036a2ee70f 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -18,7 +18,7 @@ Other Enhancements
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`)
-
+- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`)
.. _whatsnew_0240.api_breaking:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index d6b18db4e71f2..b4a599758417c 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1,3 +1,5 @@
+import importlib
+
import pytest
import numpy as np
@@ -249,3 +251,17 @@ def any_int_dtype(request):
"""
return request.param
+
+
+@pytest.fixture
+def mock():
+    """
+    Fixture providing the 'mock' module.
+
+    On Python 3 the standard library's 'unittest.mock' is returned. On
+    Python 2 the third-party 'mock' package is imported instead, and the
+    requesting test is skipped when that package is not installed.
+    """
+    if not PY3:
+        # importorskip skips the requesting test if the import fails.
+        return pytest.importorskip("mock")
+    # Resolved by name at call time rather than via a top-level import.
+    return importlib.import_module("unittest.mock")
diff --git a/pandas/io/common.py b/pandas/io/common.py
index ac9077f2db50e..6d579fc8a8a09 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -88,7 +88,7 @@ def _is_url(url):
"""
try:
return parse_url(url).scheme in _VALID_URLS
- except:
+ except Exception:
return False
@@ -165,7 +165,15 @@ def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ['s3', 's3n', 's3a']
- except: # noqa
+ except Exception:
+ return False
+
+
+def is_gcs_url(url):
+    """
+    Check whether `url` uses a Google Cloud Storage scheme.
+
+    Returns True when the parsed scheme is 'gcs' or 'gs'. Returns False
+    otherwise, including when parsing `url` raises an exception.
+    """
+    gcs_schemes = ['gcs', 'gs']
+    try:
+        matched = parse_url(url).scheme in gcs_schemes
+    except Exception:
+        # Anything that cannot be parsed is treated as "not a GCS URL".
+        return False
+    return matched
@@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=compression,
mode=mode)
+ if is_gcs_url(filepath_or_buffer):
+ from pandas.io import gcs
+ return gcs.get_filepath_or_buffer(filepath_or_buffer,
+ encoding=encoding,
+ compression=compression,
+ mode=mode)
+
if isinstance(filepath_or_buffer, (compat.string_types,
compat.binary_type,
mmap.mmap)):
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
index e86d33742b266..793a95ffb0ee7 100644
--- a/pandas/io/excel.py
+++ b/pandas/io/excel.py
@@ -46,7 +46,7 @@
io : string, path object (pathlib.Path or py._path.local.LocalPath),
file-like object, pandas ExcelFile, or xlrd workbook.
The string could be a URL. Valid URL schemes include http, ftp, s3,
- and file. For file URLs, a host is expected. For instance, a local
+ gcs, and file. For file URLs, a host is expected. For instance, a local
file could be file://localhost/path/to/workbook.xlsx
sheet_name : string, int, mixed list of strings/ints, or None, default 0
diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py
new file mode 100644
index 0000000000000..aa1cb648f05d1
--- /dev/null
+++ b/pandas/io/gcs.py
@@ -0,0 +1,16 @@
+""" GCS support for remote file interactivity """
+try:
+ import gcsfs
+except ImportError:
+ raise ImportError("The gcsfs library is required to handle GCS files")
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+                           compression=None, mode=None):
+    """
+    Open a Google Cloud Storage path through gcsfs.
+
+    Parameters
+    ----------
+    filepath_or_buffer : passed as the path to ``GCSFileSystem.open``
+    encoding : accepted but not used by this function
+    compression : returned unchanged as the third element of the result
+    mode : mode passed to ``GCSFileSystem.open``; 'rb' when None
+
+    Returns
+    -------
+    tuple of (opened file object, None, compression, True)
+    """
+    open_mode = 'rb' if mode is None else mode
+    # NOTE(review): the None / True slots presumably mean "no encoding"
+    # and "caller should close the handle" -- confirm against the caller.
+    gcs_file = gcsfs.GCSFileSystem().open(filepath_or_buffer, open_mode)
+    return gcs_file, None, compression, True
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index 1627b2f4d3ec3..9992be521d61f 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
Parameters
----------
path_or_buf : a valid JSON string or file-like, default: None
- The string could be a URL. Valid URL schemes include http, ftp, s3, and
- file. For file URLs, a host is expected. For instance, a local file
- could be ``file://localhost/path/to/table.json``
+ The string could be a URL. Valid URL schemes include http, ftp, s3,
+ gcs, and file. For file URLs, a host is expected. For instance, a local
+ file could be ``file://localhost/path/to/table.json``
orient : string,
Indication of expected JSON string format.
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index b4f5d67530fbd..65527ac1b278f 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -20,7 +20,7 @@
DatetimeIndex, TimedeltaIndex, Timestamp,
Panel, Period, Categorical, isna, Interval,
DateOffset)
-from pandas.compat import u, PY2, PY3, StringIO, lrange
+from pandas.compat import u, PY2, StringIO, lrange
from pandas.core.dtypes import inference
from pandas.core.dtypes.common import (
is_timedelta64_dtype,
@@ -128,7 +128,7 @@ def test_is_dict_like_fails(ll):
assert not inference.is_dict_like(ll)
-def test_is_file_like():
+def test_is_file_like(mock):
class MockFile(object):
pass
@@ -166,10 +166,7 @@ class MockFile(object):
# Iterator but no read / write attributes
data = [1, 2, 3]
assert not is_file(data)
-
- if PY3:
- from unittest import mock
- assert not is_file(mock.Mock())
+ assert not is_file(mock.Mock())
@pytest.mark.parametrize(
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index b39122e5e7906..6e1d3575a1481 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -1546,7 +1546,7 @@ def test_file_handles(self):
assert not m.closed
m.close()
- def test_invalid_file_buffer(self):
+ def test_invalid_file_buffer(self, mock):
# see gh-15337
class InvalidBuffer(object):
@@ -1577,11 +1577,8 @@ def seek(self, pos, whence=0):
tm.assert_frame_equal(result, expected)
- if PY3:
- from unittest import mock
-
- with tm.assert_raises_regex(ValueError, msg):
- self.read_csv(mock.Mock())
+ with tm.assert_raises_regex(ValueError, msg):
+ self.read_csv(mock.Mock())
@tm.capture_stderr
def test_skip_bad_lines(self):
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
new file mode 100644
index 0000000000000..251c93df0733d
--- /dev/null
+++ b/pandas/tests/io/test_gcs.py
@@ -0,0 +1,47 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, date_range, read_csv
+from pandas.compat import StringIO
+from pandas.io.common import is_gcs_url
+from pandas.util import _test_decorators as td
+from pandas.util.testing import assert_frame_equal
+
+
+def test_is_gcs_url():
+    """Both 'gcs' and 'gs' schemes are recognised; 's3' is not."""
+    accepted_urls = ("gcs://pandas/somethingelse.com",
+                     "gs://pandas/somethingelse.com")
+    for candidate in accepted_urls:
+        assert is_gcs_url(candidate)
+    assert not is_gcs_url("s3://pandas/somethingelse.com")
+
+
+@td.skip_if_no('gcsfs')
+def test_read_csv_gcs(mock):
+    """Round-trip a frame through read_csv with GCSFileSystem patched."""
+    expected = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
+                          'str': ['t', 's'],
+                          'dt': date_range('2018-06-18', periods=2)})
+    csv_buffer = StringIO(expected.to_csv(index=False))
+    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
+        # Whatever GCSFileSystem() returns hands back the CSV text on open.
+        MockFileSystem.return_value.open.return_value = csv_buffer
+        result = read_csv('gs://test/test.csv', parse_dates=['dt'])
+
+    assert_frame_equal(expected, result)
+
+
+@td.skip_if_no('gcsfs')
+def test_gcs_get_filepath_or_buffer(mock):
+    """Check that read_csv calls pandas.io.gcs for a gs:// path."""
+    expected = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
+                          'str': ['t', 's'],
+                          'dt': date_range('2018-06-18', periods=2)})
+    canned = (StringIO(expected.to_csv(index=False)), None, None, False)
+    with mock.patch('pandas.io.gcs.get_filepath_or_buffer') as MockGetFilepath:
+        # The patched helper returns the canned 4-tuple instead of opening
+        # anything remote.
+        MockGetFilepath.return_value = canned
+        result = read_csv('gs://test/test.csv', parse_dates=['dt'])
+
+    assert_frame_equal(expected, result)
+    assert MockGetFilepath.called
+
+
+@pytest.mark.skipif(td.safe_import('gcsfs'),
+                    reason='Only check when gcsfs not installed')
+def test_gcs_not_present_exception():
+    """Reading a gs:// path without gcsfs raises a descriptive ImportError."""
+    with pytest.raises(ImportError) as e:
+        read_csv('gs://test/test.csv')
+    # Kept outside the ``with`` block: a statement placed after the raising
+    # call inside the block would never execute, so the message would go
+    # unchecked.
+    assert 'gcsfs library is required' in str(e.value)
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index 83c1433bf5c39..01198fc541e0c 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -96,6 +96,7 @@ def show_versions(as_json=False):
("fastparquet", lambda mod: mod.__version__),
("pandas_gbq", lambda mod: mod.__version__),
("pandas_datareader", lambda mod: mod.__version__),
+ ("gcsfs", lambda mod: mod.__version__),
]
deps_blob = list()