TST: placement of network error catching in s3 tests #19645

Merged · 4 commits · Feb 13, 2018
Changes from all commits
13 changes: 9 additions & 4 deletions pandas/io/common.py
```diff
@@ -183,7 +183,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
 
     Returns
     -------
-    a filepath_ or buffer or S3File instance, the encoding, the compression
+    tuple of ({a filepath_ or buffer or S3File instance},
+              encoding, str,
+              compression, str,
+              should_close, bool)
     """
     filepath_or_buffer = _stringify_path(filepath_or_buffer)
 
@@ -194,7 +197,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
             # Override compression based on Content-Encoding header
             compression = 'gzip'
         reader = BytesIO(req.read())
-        return reader, encoding, compression
+        req.close()
+        return reader, encoding, compression, True
 
     if is_s3_url(filepath_or_buffer):
         from pandas.io import s3
@@ -206,13 +210,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     if isinstance(filepath_or_buffer, (compat.string_types,
                                        compat.binary_type,
                                        mmap.mmap)):
-        return _expand_user(filepath_or_buffer), None, compression
+        return _expand_user(filepath_or_buffer), None, compression, False
 
     if not is_file_like(filepath_or_buffer):
         msg = "Invalid file path or buffer object type: {_type}"
         raise ValueError(msg.format(_type=type(filepath_or_buffer)))
 
-    return filepath_or_buffer, None, compression
+    return filepath_or_buffer, None, compression, False
 
 
 def file_path_to_url(path):
@@ -309,6 +313,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     is_text : boolean, default True
         whether file/buffer is in text format (csv, json, etc.), or in binary
         mode (pickle, etc.)
+
     Returns
     -------
     f : file-like
```
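This is the heart of the change: `get_filepath_or_buffer` now returns a 4-tuple whose last element, `should_close`, tells the caller whether the function opened the handle itself (the URL and S3 cases) or was merely handed one. A minimal sketch of the new calling convention follows; `read_bytes` is an invented name for illustration, not code from the PR:

```python
# Sketch of the contract the diff above establishes: callers unpack four
# values and close the handle only when get_filepath_or_buffer created
# it (should_close is True).
from pandas.io.common import get_filepath_or_buffer


def read_bytes(path_or_url):
    buf, _, _, should_close = get_filepath_or_buffer(path_or_url)
    try:
        if hasattr(buf, 'read'):
            return buf.read()
        with open(buf, 'rb') as f:  # a plain filepath came back: open it ourselves
            return f.read()
    finally:
        if should_close:
            try:
                buf.close()
            except (OSError, AttributeError):
                pass
```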
2 changes: 1 addition & 1 deletion pandas/io/excel.py
```diff
@@ -381,7 +381,7 @@ def __init__(self, io, **kwds):
         if _is_url(self._io):
             io = _urlopen(self._io)
         elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
-            io, _, _ = get_filepath_or_buffer(self._io)
+            io, _, _, _ = get_filepath_or_buffer(self._io)
 
         if engine == 'xlrd' and isinstance(io, xlrd.Book):
             self.book = io
```
10 changes: 8 additions & 2 deletions pandas/io/json/json.py
```diff
@@ -404,7 +404,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     """
 
     compression = _infer_compression(path_or_buf, compression)
-    filepath_or_buffer, _, compression = get_filepath_or_buffer(
+    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
         path_or_buf, encoding=encoding, compression=compression,
     )
 
@@ -419,7 +419,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     if chunksize:
         return json_reader
 
-    return json_reader.read()
+    result = json_reader.read()
+    if should_close:
+        try:
+            filepath_or_buffer.close()
+        except:  # noqa: flake8
+            pass
+    return result
 
 
 class JsonReader(BaseIterator):
```
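The close-if-owned block that `read_json` gains here is repeated nearly verbatim in `packers.py`, `parquet.py`, and `parsers.py` below. A hypothetical helper, not part of this PR, could factor it out:

```python
def _close_if_owned(handle, should_close):
    """Close `handle` only if the reader opened it.

    Narrower exception list than the bare `except` used in the PR's
    diffs, but the same intent: a failed close never masks the result.
    """
    if should_close:
        try:
            handle.close()
        except (OSError, ValueError, AttributeError):
            pass
```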
8 changes: 7 additions & 1 deletion pandas/io/packers.py
```diff
@@ -180,14 +180,20 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
     obj : type of object stored in file
 
     """
-    path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
+    path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
     if iterator:
         return Iterator(path_or_buf)
 
     def read(fh):
         l = list(unpack(fh, encoding=encoding, **kwargs))
         if len(l) == 1:
             return l[0]
+
+        if should_close:
+            try:
+                path_or_buf.close()
+            except:  # noqa: flake8
+                pass
         return l
 
     # see if we have an actual file
```
30 changes: 19 additions & 11 deletions pandas/io/parquet.py
```diff
@@ -107,7 +107,7 @@ def write(self, df, path, compression='snappy',
         self.validate_dataframe(df)
         if self._pyarrow_lt_070:
             self._validate_write_lt_070(df)
-        path, _, _ = get_filepath_or_buffer(path, mode='wb')
+        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
 
         if self._pyarrow_lt_060:
             table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
@@ -121,13 +121,21 @@ def write(self, df, path, compression='snappy',
                 coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
-        path, _, _ = get_filepath_or_buffer(path)
+        path, _, _, should_close = get_filepath_or_buffer(path)
         if self._pyarrow_lt_070:
-            return self.api.parquet.read_pandas(path, columns=columns,
-                                                **kwargs).to_pandas()
-        kwargs['use_pandas_metadata'] = True
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
+            result = self.api.parquet.read_pandas(path, columns=columns,
+                                                  **kwargs).to_pandas()
+        else:
+            kwargs['use_pandas_metadata'] = True
+            result = self.api.parquet.read_table(path, columns=columns,
+                                                 **kwargs).to_pandas()
+        if should_close:
+            try:
+                path.close()
+            except:  # noqa: flake8
+                pass
+
+        return result
 
     def _validate_write_lt_070(self, df):
         # Compatibility shim for pyarrow < 0.7.0
@@ -199,11 +207,11 @@ def write(self, df, path, compression='snappy', **kwargs):
             # path is s3:// so we need to open the s3file in 'wb' mode.
             # TODO: Support 'ab'
 
-            path, _, _ = get_filepath_or_buffer(path, mode='wb')
+            path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
             # And pass the opened s3file to the fastparquet internal impl.
             kwargs['open_with'] = lambda path, _: path
         else:
-            path, _, _ = get_filepath_or_buffer(path)
+            path, _, _, _ = get_filepath_or_buffer(path)
 
         with catch_warnings(record=True):
             self.api.write(path, df,
@@ -214,13 +222,13 @@ def read(self, path, columns=None, **kwargs):
             # When path is s3:// an S3File is returned.
             # We need to retain the original path(str) while also
             # pass the S3File().open function to fsatparquet impl.
-            s3, _, _ = get_filepath_or_buffer(path)
+            s3, _, _, should_close = get_filepath_or_buffer(path)
             try:
                 parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
             finally:
                 s3.close()
         else:
-            path, _, _ = get_filepath_or_buffer(path)
+            path, _, _, _ = get_filepath_or_buffer(path)
             parquet_file = self.api.ParquetFile(path)
 
         return parquet_file.to_pandas(columns=columns, **kwargs)
```
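In the fastparquet S3 read path, the original `s3://` string must survive (fastparquet stores it as the file identifier) while the opened `S3File` supplies the opener. An illustrative standalone version of that pattern, assuming, as the diff does, an s3fs version where `S3File.s3` points back at the filesystem:

```python
import fastparquet


def read_parquet_s3(path, s3_file):
    # fastparquet keeps `path` as the identifier and calls `open_with`
    # whenever it needs a readable handle for it.
    pf = fastparquet.ParquetFile(path, open_with=s3_file.s3.open)
    return pf.to_pandas()
```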
9 changes: 8 additions & 1 deletion pandas/io/parsers.py
```diff
@@ -413,7 +413,7 @@ def _read(filepath_or_buffer, kwds):
 
     compression = kwds.get('compression')
     compression = _infer_compression(filepath_or_buffer, compression)
-    filepath_or_buffer, _, compression = get_filepath_or_buffer(
+    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
         filepath_or_buffer, encoding, compression)
     kwds['compression'] = compression
 
@@ -439,6 +439,13 @@ def _read(filepath_or_buffer, kwds):
         data = parser.read(nrows)
     finally:
         parser.close()
+
+    if should_close:
+        try:
+            filepath_or_buffer.close()
+        except:  # noqa: flake8
+            pass
+
     return data
 
 
```
4 changes: 2 additions & 2 deletions pandas/io/s3.py
```diff
@@ -27,7 +27,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     fs = s3fs.S3FileSystem(anon=False)
     try:
         filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
-    except (OSError, NoCredentialsError):
+    except (compat.FileNotFoundError, NoCredentialsError):
         # boto3 has troubles when trying to access a public file
         # when credentialed...
         # An OSError is raised if you have credentials, but they
@@ -36,4 +36,4 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         # for that bucket.
         fs = s3fs.S3FileSystem(anon=True)
         filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
-    return filepath_or_buffer, None, compression
+    return filepath_or_buffer, None, compression, True
```
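`compat.FileNotFoundError` resolves to the builtin on Python 3 and to `IOError` on Python 2, so this still catches the failure that credentialed access to a public bucket can produce, while no longer swallowing every `OSError`. The fallback in isolation, sketched with the Python 3 builtin (`open_s3` is an invented name, and the real code strips the `s3://` schema first):

```python
import s3fs
from botocore.exceptions import NoCredentialsError


def open_s3(key, mode='rb'):
    try:
        return s3fs.S3FileSystem(anon=False).open(key, mode)
    except (FileNotFoundError, NoCredentialsError):
        # Credentialed access failed; the bucket may still be public,
        # so retry anonymously before giving up.
        return s3fs.S3FileSystem(anon=True).open(key, mode)
```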
2 changes: 1 addition & 1 deletion pandas/io/sas/sas7bdat.py
```diff
@@ -90,7 +90,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self._current_row_on_page_index = 0
         self._current_row_in_file_index = 0
 
-        self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
+        self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
         if isinstance(self._path_or_buf, compat.string_types):
             self._path_or_buf = open(self._path_or_buf, 'rb')
             self.handle = self._path_or_buf
```
3 changes: 2 additions & 1 deletion pandas/io/sas/sas_xport.py
```diff
@@ -236,7 +236,8 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
         self._chunksize = chunksize
 
         if isinstance(filepath_or_buffer, str):
-            filepath_or_buffer, encoding, compression = get_filepath_or_buffer(
+            (filepath_or_buffer, encoding,
+             compression, should_close) = get_filepath_or_buffer(
                 filepath_or_buffer, encoding=encoding)
 
         if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
```
2 changes: 1 addition & 1 deletion pandas/io/stata.py
```diff
@@ -988,7 +988,7 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._native_byteorder = _set_endianness(sys.byteorder)
         path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
-            path_or_buf, encoding, _ = get_filepath_or_buffer(
+            path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
                 path_or_buf, encoding=self._default_encoding
             )
 
```
53 changes: 31 additions & 22 deletions pandas/tests/io/conftest.py
```diff
@@ -2,30 +2,34 @@
 
 import pytest
 from pandas.io.parsers import read_table
+from pandas.util import testing as tm
 
-HERE = os.path.dirname(__file__)
+
+@pytest.fixture
+def parser_data(request):
+    return os.path.join(tm.get_data_path(), '..', 'parser', 'data')
 
 
-@pytest.fixture(scope='module')
-def tips_file():
+@pytest.fixture
+def tips_file(parser_data):
     """Path to the tips dataset"""
-    return os.path.join(HERE, 'parser', 'data', 'tips.csv')
+    return os.path.join(parser_data, 'tips.csv')
 
 
-@pytest.fixture(scope='module')
-def jsonl_file():
+@pytest.fixture
+def jsonl_file(parser_data):
     """Path a JSONL dataset"""
-    return os.path.join(HERE, 'parser', 'data', 'items.jsonl')
+    return os.path.join(parser_data, 'items.jsonl')
 
 
-@pytest.fixture(scope='module')
-def salaries_table():
+@pytest.fixture
+def salaries_table(parser_data):
     """DataFrame with the salaries dataset"""
-    path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
+    path = os.path.join(parser_data, 'salaries.csv')
     return read_table(path)
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture
 def s3_resource(tips_file, jsonl_file):
     """Fixture for mocking S3 interaction.
 
@@ -41,8 +45,8 @@ def s3_resource(tips_file, jsonl_file):
     is yielded by the fixture.
     """
     pytest.importorskip('s3fs')
+    boto3 = pytest.importorskip('boto3')
     moto = pytest.importorskip('moto')
-    moto.mock_s3().start()
 
     test_s3_files = [
         ('tips.csv', tips_file),
@@ -58,17 +62,22 @@ def add_tips_files(bucket_name):
                     Key=s3_key,
                     Body=f)
 
-    boto3 = pytest.importorskip('boto3')
-    # see gh-16135
-    bucket = 'pandas-test'
+    try:
 
-    conn = boto3.resource("s3", region_name="us-east-1")
-    conn.create_bucket(Bucket=bucket)
-    add_tips_files(bucket)
+        s3 = moto.mock_s3()
+        s3.start()
 
-    conn.create_bucket(Bucket='cant_get_it', ACL='private')
-    add_tips_files('cant_get_it')
+        # see gh-16135
+        bucket = 'pandas-test'
+        conn = boto3.resource("s3", region_name="us-east-1")
 
-    yield conn
+        conn.create_bucket(Bucket=bucket)
+        add_tips_files(bucket)
 
-    moto.mock_s3().stop()
+        conn.create_bucket(Bucket='cant_get_it', ACL='private')
+        add_tips_files('cant_get_it')
+        yield conn
+    except:  # noqa: flake8
+        pytest.skip("failure to use s3 resource")
+    finally:
+        s3.stop()
```
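With the fixture restructured this way, any failure while standing up the mocked buckets skips the test instead of erroring it, which is the placement the PR title refers to. An illustrative test using the fixture (hypothetical, not in this diff):

```python
# Requesting s3_resource yields a mocked boto3 S3 resource with the
# pandas-test and cant_get_it buckets pre-loaded, or skips the test if
# moto/s3fs/boto3 cannot be set up.
from pandas import read_csv


def test_tips_roundtrip_from_mock_s3(s3_resource):
    df = read_csv('s3://pandas-test/tips.csv')
    assert not df.empty
```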
1 change: 0 additions & 1 deletion pandas/tests/io/json/test_pandas.py
```diff
@@ -1039,7 +1039,6 @@ def test_read_inline_jsonl(self):
         assert_frame_equal(result, expected)
 
     def test_read_s3_jsonl(self, s3_resource):
-        pytest.importorskip('s3fs')
         # GH17200
 
         result = read_json('s3n://pandas-test/items.jsonl', lines=True)
```