Skip to content

Commit d5e97d0

Browse files
authored
TST: Make test_user_agent run in CI (#56057)
* TST: Make test_user_agent run in CI * Fix module skip name
1 parent caab88b commit d5e97d0

File tree

2 files changed

+172
-403
lines changed

2 files changed

+172
-403
lines changed

pandas/tests/io/test_http_headers.py

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
"""
2+
Tests for the pandas custom headers in http(s) requests
3+
"""
4+
from functools import partial
5+
import gzip
6+
from io import BytesIO
7+
8+
import pytest
9+
10+
import pandas.util._test_decorators as td
11+
12+
import pandas as pd
13+
import pandas._testing as tm
14+
15+
# Marks applied to every test in this module: these tests exercise HTTP(S)
# request handling, so they are network-marked and restricted to a single
# CPU.  The filterwarnings entry ignores a pandas BlockManager
# DeprecationWarning (presumably emitted by the readers under test --
# TODO confirm).
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.network,
    pytest.mark.filterwarnings(
        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
    ),
]
22+
23+
24+
def gzip_bytes(response_bytes):
    """Gzip-compress ``response_bytes`` and return the compressed payload."""
    buffer = BytesIO()
    with gzip.GzipFile(fileobj=buffer, mode="w") as compressor:
        compressor.write(response_bytes)
    # GzipFile must be closed (end of the ``with``) before the buffer holds
    # a complete gzip stream.
    return buffer.getvalue()
29+
30+
31+
def csv_responder(df):
    """Serialize ``df`` as CSV (no index) and return UTF-8 encoded bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
33+
34+
35+
def gz_csv_responder(df):
    """CSV-serialize ``df``, then gzip the resulting bytes."""
    raw = csv_responder(df)
    return gzip_bytes(raw)
37+
38+
39+
def json_responder(df):
    """Serialize ``df`` with ``DataFrame.to_json`` and return UTF-8 bytes."""
    json_text = df.to_json()
    return json_text.encode("utf-8")
41+
42+
43+
def gz_json_responder(df):
    """JSON-serialize ``df``, then gzip the resulting bytes."""
    raw = json_responder(df)
    return gzip_bytes(raw)
45+
46+
47+
def html_responder(df):
    """Render ``df`` as an HTML table (no index) and return UTF-8 bytes."""
    html_text = df.to_html(index=False)
    return html_text.encode("utf-8")
49+
50+
51+
def parquetpyarrow_reponder(df):
    """Serialize ``df`` to parquet bytes via the pyarrow engine."""
    # NOTE(review): function name has a typo ("reponder"); kept as-is because
    # the parametrization below references it by this name.
    payload = df.to_parquet(engine="pyarrow", index=False)
    return payload
53+
54+
55+
def parquetfastparquet_responder(df):
    """Serialize ``df`` to parquet bytes via the fastparquet engine.

    Unlike the other writers here, fastparquet does not cooperate with a
    plain in-memory buffer, so the frame is written to fsspec's in-memory
    filesystem and the resulting file is read back as bytes.
    """
    # protected by an importorskip in the respective test
    import fsspec

    target = "memory://fastparquet_user_agent.parquet"
    df.to_parquet(
        target,
        index=False,
        engine="fastparquet",
        compression=None,
    )
    with fsspec.open(target, "rb") as fh:
        return fh.read()
72+
73+
74+
def pickle_respnder(df):
    """Pickle ``df`` and return the serialized bytes."""
    # NOTE(review): function name has a typo ("respnder"); kept as-is because
    # the parametrization below references it by this name.
    buffer = BytesIO()
    df.to_pickle(buffer)
    return buffer.getvalue()
78+
79+
80+
def stata_responder(df):
    """Serialize ``df`` in Stata .dta format and return the bytes."""
    buffer = BytesIO()
    df.to_stata(buffer, write_index=False)
    return buffer.getvalue()
84+
85+
86+
@pytest.mark.parametrize(
    "responder, read_method",
    [
        (csv_responder, pd.read_csv),
        (json_responder, pd.read_json),
        (
            html_responder,
            lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
        ),
        pytest.param(
            parquetpyarrow_reponder,
            partial(pd.read_parquet, engine="pyarrow"),
            marks=td.skip_if_no("pyarrow"),
        ),
        pytest.param(
            parquetfastparquet_responder,
            partial(pd.read_parquet, engine="fastparquet"),
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_if_no("fastparquet"),
                td.skip_if_no("fsspec"),
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (pickle_respnder, pd.read_pickle),
        (stata_responder, pd.read_stata),
        (gz_csv_responder, pd.read_csv),
        (gz_json_responder, pd.read_json),
    ],
)
@pytest.mark.parametrize(
    "storage_options",
    [
        None,
        {"User-Agent": "foo"},
        {"User-Agent": "foo", "Auth": "bar"},
    ],
)
def test_request_headers(responder, read_method, httpserver, storage_options):
    """Round-trip a frame through a local HTTP server and check that the
    request carried exactly the expected headers: the defaults plus any
    custom ones supplied via ``storage_options``.
    """
    expected = pd.DataFrame({"a": ["b"]})
    default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"]
    if "gz" in responder.__name__:
        extra = {"Content-Encoding": "gzip"}
        if storage_options is None:
            storage_options = dict(extra)
        else:
            # BUG FIX: build a new dict with ``|`` instead of mutating with
            # ``|=``.  The dicts in the ``storage_options`` parametrization
            # are created once at collection time and shared across every
            # parametrized run, so in-place mutation here would leak
            # "Content-Encoding: gzip" into later, non-gzip test cases.
            storage_options = storage_options | extra
    else:
        extra = None
    expected_headers = set(default_headers).union(
        storage_options.keys() if storage_options else []
    )
    httpserver.serve_content(content=responder(expected), headers=extra)
    result = read_method(httpserver.url, storage_options=storage_options)
    tm.assert_frame_equal(result, expected)

    # Every expected header must be present exactly once; pop() raises
    # KeyError (failing the test) if one is missing.
    request_headers = dict(httpserver.requests[0].headers)
    for header in expected_headers:
        exp = request_headers.pop(header)
        if storage_options and header in storage_options:
            assert exp == storage_options[header]
    # No extra headers added
    assert not request_headers
149+
150+
151+
@pytest.mark.parametrize(
    "engine",
    [
        "pyarrow",
        "fastparquet",
    ],
)
def test_to_parquet_to_disk_with_storage_options(engine):
    """Passing ``storage_options`` together with a plain local file path
    must raise ``ValueError``, whichever parquet engine is used.
    """
    pytest.importorskip(engine)

    custom_headers = {
        "User-Agent": "custom",
        "Auth": "other_custom",
    }
    true_df = pd.DataFrame({"column_name": ["column_value"]})
    # Either engine's error message is acceptable.
    expected_msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=expected_msg):
        true_df.to_parquet(
            "/tmp/junk.parquet", storage_options=custom_headers, engine=engine
        )

0 commit comments

Comments
 (0)