"""
Tests for pandas custom headers in HTTP(S) requests.
"""
from functools import partial
import gzip
from io import BytesIO

import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.network,
    pytest.mark.filterwarnings(
        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
    ),
]
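
# For plain http(s) URLs, pandas forwards the ``storage_options`` mapping as
# extra request headers. A minimal sketch of the behavior under test (the URL
# here is hypothetical):
#
#   pd.read_csv(
#       "https://example.com/data.csv",
#       storage_options={"User-Agent": "foo", "Auth": "bar"},
#   )
#
# The server then sees "User-Agent: foo" and "Auth: bar" in addition to the
# default urllib headers.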


def gzip_bytes(response_bytes):
    with BytesIO() as bio:
        with gzip.GzipFile(fileobj=bio, mode="w") as zipper:
            zipper.write(response_bytes)
        # the GzipFile must be closed before getvalue() so the gzip
        # trailer is flushed into the buffer
        return bio.getvalue()
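

# Responders serialize a DataFrame into the raw bytes the test HTTP server
# returns; the paired pd.read_* call should round-trip them back to the
# original frame.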


def csv_responder(df):
    return df.to_csv(index=False).encode("utf-8")


def gz_csv_responder(df):
    return gzip_bytes(csv_responder(df))


def json_responder(df):
    return df.to_json().encode("utf-8")


def gz_json_responder(df):
    return gzip_bytes(json_responder(df))


def html_responder(df):
    return df.to_html(index=False).encode("utf-8")


def parquetpyarrow_responder(df):
    return df.to_parquet(index=False, engine="pyarrow")


def parquetfastparquet_responder(df):
    # The fastparquet engine doesn't like writing to a buffer: it can be told
    # to via the open_with function, but it then calls close() itself and
    # wipes the buffer. Write to fsspec's in-memory filesystem instead and
    # read the bytes back out.

    # protected by an importorskip in the respective test
    import fsspec

    df.to_parquet(
        "memory://fastparquet_user_agent.parquet",
        index=False,
        engine="fastparquet",
        compression=None,
    )
    with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f:
        return f.read()


def pickle_responder(df):
    with BytesIO() as bio:
        df.to_pickle(bio)
        return bio.getvalue()


def stata_responder(df):
    with BytesIO() as bio:
        df.to_stata(bio, write_index=False)
        return bio.getvalue()
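

# Each parametrized case pairs a responder with the reader expected to
# round-trip its output; the gz_* variants additionally exercise transparent
# Content-Encoding: gzip handling.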
@pytest.mark.parametrize(
    "responder, read_method",
    [
        (csv_responder, pd.read_csv),
        (json_responder, pd.read_json),
        (
            html_responder,
            lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
        ),
        pytest.param(
            parquetpyarrow_responder,
            partial(pd.read_parquet, engine="pyarrow"),
            marks=td.skip_if_no("pyarrow"),
        ),
        pytest.param(
            parquetfastparquet_responder,
            partial(pd.read_parquet, engine="fastparquet"),
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_if_no("fastparquet"),
                td.skip_if_no("fsspec"),
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (pickle_responder, pd.read_pickle),
        (stata_responder, pd.read_stata),
        (gz_csv_responder, pd.read_csv),
        (gz_json_responder, pd.read_json),
    ],
)
@pytest.mark.parametrize(
    "storage_options",
    [
        None,
        {"User-Agent": "foo"},
        {"User-Agent": "foo", "Auth": "bar"},
    ],
)
def test_request_headers(responder, read_method, httpserver, storage_options):
    expected = pd.DataFrame({"a": ["b"]})
    default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"]
    if "gz" in responder.__name__:
        extra = {"Content-Encoding": "gzip"}
        if storage_options is None:
            storage_options = extra
        else:
            # merge into a copy so the parametrized dict is not mutated
            # across test cases
            storage_options = storage_options | extra
    else:
        extra = None
    expected_headers = set(default_headers).union(
        storage_options.keys() if storage_options else []
    )
    httpserver.serve_content(content=responder(expected), headers=extra)
    result = read_method(httpserver.url, storage_options=storage_options)
    tm.assert_frame_equal(result, expected)

    request_headers = dict(httpserver.requests[0].headers)
    for header in expected_headers:
        value = request_headers.pop(header)
        if storage_options and header in storage_options:
            assert value == storage_options[header]
    # no unexpected headers were sent
    assert not request_headers
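

# storage_options is only supported for fsspec-style paths and URLs;
# combining it with a plain local file path should raise ValueError for
# both engines.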
@pytest.mark.parametrize(
    "engine",
    [
        "pyarrow",
        "fastparquet",
    ],
)
def test_to_parquet_to_disk_with_storage_options(engine):
    headers = {
        "User-Agent": "custom",
        "Auth": "other_custom",
    }

    pytest.importorskip(engine)

    true_df = pd.DataFrame({"column_name": ["column_value"]})
    msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=msg):
        true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine)