diff --git a/pandas/io/common.py b/pandas/io/common.py index 51323c5ff3ef5..32ec088f00d88 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -202,9 +202,37 @@ def get_filepath_or_buffer( filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") - file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) - ).open() + # If botocore is installed we fall back to reading with anon=True + # to allow reads from public buckets + err_types_to_retry_with_anon: List[Any] = [] + try: + import_optional_dependency("botocore") + from botocore.exceptions import ClientError, NoCredentialsError + + err_types_to_retry_with_anon = [ + ClientError, + NoCredentialsError, + PermissionError, + ] + except ImportError: + pass + + try: + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() + # GH 34626 Reads from Public Buckets without Credentials needs anon=True + except tuple(err_types_to_retry_with_anon): + if storage_options is None: + storage_options = {"anon": True} + else: + # don't mutate user input. 
+ storage_options = dict(storage_options) + storage_options["anon"] = True + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() + return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index a76be9465f62a..5e0f7edf4d8ae 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -1,8 +1,12 @@ from io import BytesIO +import os import pytest +import pandas.util._test_decorators as td + from pandas import read_csv +import pandas._testing as tm def test_streaming_s3_objects(): @@ -15,3 +19,30 @@ def test_streaming_s3_objects(): for el in data: body = StreamingBody(BytesIO(el), content_length=len(el)) read_csv(body) + + +@tm.network +@td.skip_if_no("s3fs") +def test_read_without_creds_from_pub_bucket(): + # GH 34626 + # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt + result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) + assert len(result) == 3 + + +@tm.network +@td.skip_if_no("s3fs") +def test_read_with_creds_from_pub_bucket(): + # Ensure we can read from a public bucket with credentials + # GH 34626 + # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt + + with tm.ensure_safe_environment_variables(): + # temporary workaround as moto fails for botocore >= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 + os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + df = read_csv( + "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None, + ) + assert len(df) == 5